From 8e95bba69787bfc34ad63f60565f8f064b51cb88 Mon Sep 17 00:00:00 2001 From: Michal Piszczek Date: Mon, 22 Aug 2022 17:11:40 -0700 Subject: [PATCH 001/704] Remove mutable defaults in mlp_model (#12546) --- .../tvm/meta_schedule/cost_model/mlp_model.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/python/tvm/meta_schedule/cost_model/mlp_model.py b/python/tvm/meta_schedule/cost_model/mlp_model.py index 04ccca0563f9..e7f07f0a4542 100644 --- a/python/tvm/meta_schedule/cost_model/mlp_model.py +++ b/python/tvm/meta_schedule/cost_model/mlp_model.py @@ -26,7 +26,7 @@ import tempfile from collections import OrderedDict from itertools import chain as itertools_chain -from typing import Dict, List, NamedTuple, Tuple +from typing import Dict, List, NamedTuple, Optional, Tuple import numpy as np # type: ignore import torch # type: ignore @@ -418,8 +418,8 @@ def forward( # pylint: disable=missing-function-docstring def extract_features( context: TuneContext, candidates: List[MeasureCandidate], - results: List[RunnerResult] = None, - extractor: FeatureExtractor = PerStoreFeature(extract_workload=True), + results: Optional[List[RunnerResult]] = None, + extractor: Optional[FeatureExtractor] = None, ): """Extract feature vectors and compute mean costs. @@ -429,9 +429,9 @@ def extract_features( The tuning context. candidates: List[MeasureCandidate] The measure candidates. - results: List[RunnerResult] + results: Optional[List[RunnerResult]] The measured results, can be None if used in prediction. - extractor: FeatureExtractor + extractor: Optional[FeatureExtractor] The feature extractor. Returns @@ -441,6 +441,7 @@ def extract_features( new_mean_costs: np.ndarray The mean costs. 
""" + extractor = extractor or PerStoreFeature(extract_workload=True) def _feature(feature: NDArray) -> np.ndarray: return feature.numpy().astype("float32") @@ -481,9 +482,12 @@ class State: def __init__( self, - model_config: SegmentSumMLPConfig = SegmentSumMLPConfig(), - extractor: FeatureExtractor = PerStoreFeature(extract_workload=True), + model_config: Optional[SegmentSumMLPConfig] = None, + extractor: Optional[FeatureExtractor] = None, ): + model_config = model_config or SegmentSumMLPConfig() + extractor = extractor or PerStoreFeature(extract_workload=True) + self.model = SegmentSumMLP(**model_config.to_dict()) self.data = OrderedDict() self.data_size = 0 @@ -662,9 +666,12 @@ class SegmentSumMLPTrainer: def __init__( self, - train_config: TrainerConfig = TrainerConfig(), - state: State = State(), + train_config: Optional[TrainerConfig] = None, + state: Optional[State] = None, ): + train_config = train_config or TrainerConfig() + state = state or State() + config = train_config.to_dict() for attr in config: setattr(self, attr, config[attr]) @@ -676,7 +683,7 @@ def train_step( self, data: Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"], batch: int = 0, - train_loss: float = None, + train_loss: Optional[float] = None, ) -> float: """Helper function for training on a single batch. @@ -686,7 +693,7 @@ def train_step( A batch of data, should be a tuple of (segment_sizes, features, gt_results). batch: int = 0 The current batch number. - train_loss: float = None + train_loss: Optional[float] = None The previous averaged training loss, None if it is the first batch. Returns @@ -863,7 +870,7 @@ def train_incremental( def predict_incremental( self, features: List[np.ndarray], - results: np.ndarray = None, + results: Optional[np.ndarray] = None, ) -> np.ndarray: """Predicting (validating) on incremental data. @@ -871,7 +878,7 @@ def predict_incremental( ---------- features: List[np.ndarray] The extracted features. 
- results: np.ndarray + results: Optional[np.ndarray] The measured results, can be None if used for predicting. Returns @@ -943,10 +950,10 @@ class MLPModel(PyCostModel): def __init__( self, *, - trainer: SegmentSumMLPTrainer = SegmentSumMLPTrainer(), + trainer: Optional[SegmentSumMLPTrainer] = None, ): super().__init__() - self.trainer = trainer + self.trainer = trainer or SegmentSumMLPTrainer() def load(self, path: str) -> None: """Load the cost model, cached data or raw data from given file location. From 3bd168194f25c95904dac8835f8e74abd423a5a3 Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Mon, 22 Aug 2022 18:39:16 -0700 Subject: [PATCH 002/704] check for CMSIS_PATH in project generation (#12547) Co-authored-by: Mohamad --- .../zephyr/template_project/CMakeLists.txt.template | 3 ++- .../microtvm/zephyr/template_project/microtvm_api_server.py | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template index 742433e82d0d..b5182bf8ac1f 100644 --- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template +++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template @@ -29,7 +29,7 @@ find_package(Zephyr HINTS $ENV{ZEPHYR_BASE}) project(microtvm_autogenerated_project) if(${ENABLE_CMSIS}) - set(CMSIS_PATH $ENV{CMSIS_PATH}) + set(CMSIS_PATH ) file(GLOB_RECURSE cmsis_lib_srcs ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c @@ -40,6 +40,7 @@ if(${ENABLE_CMSIS}) ) set(cmsis_includes + ${CMSIS_PATH}/CMSIS/Core/Include ${CMSIS_PATH}/CMSIS/NN/Include ${CMSIS_PATH}/CMSIS/DSP/Include ${CMSIS_PATH}/CMSIS/DSP/Include/dsp diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index c55bd63fa4dd..eb20c3e88448 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ 
b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -463,6 +463,7 @@ def _create_prj_conf(self, project_dir, options): API_SERVER_CRT_LIBS_TOKEN = "" CMAKE_ARGS_TOKEN = "" QEMU_PIPE_TOKEN = "" + CMSIS_PATH_TOKEN = "" CRT_LIBS_BY_PROJECT_TYPE = { "host_driven": "microtvm_rpc_server microtvm_rpc_common aot_executor_module aot_executor common", @@ -521,6 +522,8 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str: cmake_args += f"set(BOARD {options['zephyr_board']})\n" enable_cmsis = self._cmsis_required(mlf_extracted_path) + if enable_cmsis: + assert os.environ.get("CMSIS_PATH"), "CMSIS_PATH is not defined." cmake_args += f"set(ENABLE_CMSIS {str(enable_cmsis).upper()})\n" return cmake_args @@ -587,6 +590,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec self.qemu_pipe_dir = pathlib.Path(tempfile.mkdtemp()) line = line.replace(self.QEMU_PIPE_TOKEN, str(self.qemu_pipe_dir / "fifo")) + if self.CMSIS_PATH_TOKEN in line and self._cmsis_required(extract_path): + line = line.replace(self.CMSIS_PATH_TOKEN, str(os.environ["CMSIS_PATH"])) + cmake_f.write(line) if options.get("compile_definitions"): From 5cef6bf559265e74b84504ed2e190f29f5c5bf33 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Tue, 23 Aug 2022 08:58:54 +0700 Subject: [PATCH 003/704] [microTVM] Rework evaluate_model_accuracy into a more generic helper function (#12539) * Add workaround for #12538 * Rework evaluate_model_accuracy into predict_labels_aot --- python/tvm/micro/testing/__init__.py | 2 +- python/tvm/micro/testing/evaluation.py | 21 ++++++--------------- tests/micro/common/test_autotune.py | 13 +++++++------ 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/python/tvm/micro/testing/__init__.py b/python/tvm/micro/testing/__init__.py index 9062f061bda3..0dc24102cb89 100644 --- a/python/tvm/micro/testing/__init__.py +++ b/python/tvm/micro/testing/__init__.py @@ -16,5 +16,5 @@ # under the License. 
"""Allows the tools specified below to be imported directly from tvm.micro.testing""" -from .evaluation import tune_model, create_aot_session, evaluate_model_accuracy +from .evaluation import tune_model, create_aot_session, predict_labels_aot from .utils import get_supported_boards, get_target diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py index 5f47e06a17f9..32de1d2a370d 100644 --- a/python/tvm/micro/testing/evaluation.py +++ b/python/tvm/micro/testing/evaluation.py @@ -142,27 +142,18 @@ def create_aot_session( return tvm.micro.Session(project.transport(), timeout_override=timeout_override) -# This utility functions was designed ONLY for one input / one output models -# where the outputs are confidences for different classes. -def evaluate_model_accuracy(session, aot_executor, input_data, true_labels, runs_per_sample=1): - """Evaluates an AOT-compiled model's accuracy and runtime over an RPC session. Works well - when used with create_aot_session.""" +def predict_labels_aot(session, aot_executor, input_data, runs_per_sample=1): + """Predicts labels for each sample in input_data using host-driven AOT. + Returns an iterator of (label, runtime) tuples. 
This function can only + be used with models for which the output is the confidence for each class.""" assert aot_executor.get_num_inputs() == 1 assert aot_executor.get_num_outputs() == 1 assert runs_per_sample > 0 - predicted_labels = [] - aot_runtimes = [] for sample in input_data: aot_executor.get_input(0).copyfrom(sample) result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)() + predicted_label = aot_executor.get_output(0).numpy().argmax() runtime = result.mean - output = aot_executor.get_output(0).numpy() - predicted_labels.append(output.argmax()) - aot_runtimes.append(runtime) - - num_correct = sum(u == v for u, v in zip(true_labels, predicted_labels)) - average_time = sum(aot_runtimes) / len(aot_runtimes) - accuracy = num_correct / len(predicted_labels) - return average_time, accuracy, predicted_labels + yield predicted_label, runtime diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py index 60b38ff211a4..b79260dd46ed 100644 --- a/tests/micro/common/test_autotune.py +++ b/tests/micro/common/test_autotune.py @@ -76,17 +76,18 @@ def test_kws_autotune_workflow(platform, board, tmp_path): np.random.randint(low=-127, high=128, size=(1, 1960), dtype=np.int8) for x in range(3) ) - labels = [0, 0, 0] - # Validate perforance across random runs - time, _, _ = tvm.micro.testing.evaluate_model_accuracy( - session, aot_executor, samples, labels, runs_per_sample=20 - ) + runtimes = [ + runtime + for _, runtime in tvm.micro.testing.predict_labels_aot( + session, aot_executor, samples, runs_per_sample=20 + ) + ] # `time` is the average time taken to execute model inference on the # device, measured in seconds. It does not include the time to upload # the input data via RPC. On slow boards like the Arduino Due, time # is around 0.12 (120 ms), so this gives us plenty of buffer. 
- assert time < 1 + assert np.median(runtimes) < 1 if __name__ == "__main__": From 58f2139ffdd39de61fcea3b090dcfa5f7d0db4be Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Tue, 23 Aug 2022 08:59:26 +0700 Subject: [PATCH 004/704] [microTVM] Replace static fixtures with parameterization (#12530) * Replace microTVM static fixtures with parameterization * [microTVM] Only perform parameterization when fixture is present * Reformat with black * Fix Cortex-M tests * Add docstring to pytest_generate_tests * Remove trailing space from docstring --- python/tvm/micro/testing/pytest_plugin.py | 30 ++++++++++++++++---- python/tvm/micro/testing/utils.py | 5 ++++ tests/micro/arduino/test_arduino_workflow.py | 13 ++++++--- tests/micro/common/conftest.py | 16 ----------- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/python/tvm/micro/testing/pytest_plugin.py b/python/tvm/micro/testing/pytest_plugin.py index 5c63711d28b3..9864b49abb61 100644 --- a/python/tvm/micro/testing/pytest_plugin.py +++ b/python/tvm/micro/testing/pytest_plugin.py @@ -26,14 +26,18 @@ from tvm.contrib.utils import tempdir -from .utils import get_supported_boards +from .utils import get_supported_platforms, get_supported_boards def pytest_addoption(parser): """Adds more pytest arguments""" + parser.addoption( + "--platform", + choices=get_supported_platforms(), + help=("microTVM platform for tests."), + ) parser.addoption( "--board", - required=True, choices=list(get_supported_boards("zephyr").keys()) + list(get_supported_boards("arduino").keys()), help=( @@ -58,9 +62,25 @@ def pytest_addoption(parser): ) -@pytest.fixture(scope="session") -def board(request): - return request.config.getoption("--board") +def pytest_generate_tests(metafunc): + """Hooks into pytest to add platform and board fixtures to tests that + require them. 
To make sure that "platform" and "board" are treated as + parameters for the appropriate tests (and included in the test names), + we add them as function level parametrizations. This prevents data + from being overwritten in Junit XML files if multiple platforms + or boards are tested.""" + + for argument in ["platform", "board"]: + if argument in metafunc.fixturenames: + value = metafunc.config.getoption(f"--{argument}", default=None) + + if not value: + raise ValueError( + f"Test {metafunc.function.__name__} in module {metafunc.module.__name__} " + f"requires a --{argument} argument, but none was given." + ) + + metafunc.parametrize(argument, [metafunc.config.getoption(f"--{argument}")]) @pytest.fixture(scope="session") diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 323108b253a2..794f443e47a6 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -33,6 +33,11 @@ TIMEOUT_SEC = 10 +@lru_cache(maxsize=None) +def get_supported_platforms(): + return ["arduino", "zephyr"] + + @lru_cache(maxsize=None) def get_supported_boards(platform: str): template = Path(tvm.micro.get_microtvm_template_projects(platform)) diff --git a/tests/micro/arduino/test_arduino_workflow.py b/tests/micro/arduino/test_arduino_workflow.py index 35bfa2556231..8d5d541d408c 100644 --- a/tests/micro/arduino/test_arduino_workflow.py +++ b/tests/micro/arduino/test_arduino_workflow.py @@ -37,9 +37,12 @@ """ # Since these tests are sequential, we'll use the same project/workspace -# directory for all tests in this file +# directory for all tests in this file. Note that --board can't be loaded +# from the fixture, since the fixture is function scoped (it has to be +# for the tests to be named correctly via parameterization). 
@pytest.fixture(scope="module") -def workflow_workspace_dir(request, board): +def workflow_workspace_dir(request): + board = request.config.getoption("--board") return test_utils.make_workspace_dir("arduino_workflow", board) @@ -48,9 +51,11 @@ def project_dir(workflow_workspace_dir): return workflow_workspace_dir / "project" -# We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced too soon +# We MUST pass workspace_dir, not project_dir, or the workspace will be dereferenced +# too soon. We can't use the board fixture either for the reason mentioned above. @pytest.fixture(scope="module") -def project(board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir): +def project(request, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir): + board = request.config.getoption("--board") return test_utils.make_kws_project( board, arduino_cli_cmd, microtvm_debug, workflow_workspace_dir ) diff --git a/tests/micro/common/conftest.py b/tests/micro/common/conftest.py index 0bf70ed06138..d86fd41bd8bf 100644 --- a/tests/micro/common/conftest.py +++ b/tests/micro/common/conftest.py @@ -17,19 +17,3 @@ pytest_plugins = [ "tvm.micro.testing.pytest_plugin", ] - -import pytest - - -def pytest_addoption(parser): - parser.addoption( - "--platform", - required=True, - choices=["arduino", "zephyr"], - help="Platform to run tests with", - ) - - -@pytest.fixture -def platform(request): - return request.config.getoption("--platform") From e252d7f3ab6eac631c960cdcb7826862958c6e59 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 22 Aug 2022 19:59:58 -0700 Subject: [PATCH 005/704] [docs] Add CI contribution instructions (#12551) This PR documents the steps to introducing a new CI docker image, which we've been doing a lot lately. 
--- docs/contribute/ci.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index a421103ab457..1284fd95fbea 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -174,6 +174,29 @@ The images for these containers are hosted in the `tlcpack Docker Hub `_. These can be inspected and run locally via standard Docker commands. +Adding a new Docker image +""""""""""""""""""""""""" + +New docker images can be added to test TVM on a variety of platforms. Here are the steps for adding +a new CI image: + +1. Define the ``docker/Dockerfile.ci_foo`` and associated scripts in ``docker/install``. Create a PR containing only these changes (no ``Jenkinsfile`` changes). + + Example: https://github.com/apache/tvm/pull/12230/files + +2. A committer verifies the image builds locally and then reviews/approves this PR. +3. A committer creates the ci-foo repos in https://hub.docker.com/u/tlcpack and https://hub.docker.com/u/tlcpackstaging. +4. Create a PR to create an ECR repo for the image in tlcpack/ci: https://github.com/tlc-pack/ci/pull/46/files +5. A committer creates and gets merged a PR to add the image to the ``Jenkinsfile`` + + Example: https://github.com/apache/tvm/pull/12369/files. + + **NOTE**: The PR must be opened from a branch in apache/tvm, not from a branch in a forked repo. + +6. A committer adds this image to the daily docker rebuild/validation run in tlcpack. + + Example: https://github.com/tlc-pack/tlcpack/pull/131 + ``ci-docker-staging`` ^^^^^^^^^^^^^^^^^^^^^ From d26bf809e4c3c8d6576d4e436475997eb12deb3e Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 23 Aug 2022 06:46:26 +0100 Subject: [PATCH 006/704] [ACL] Adjust mobilenet test for Keras 2.9 (#12541) In Keras 2.7, one "reshape" operator was removed from the Mobilenet model, making our test which verifies the number of operators to be incorrect. 
This patch adjusts the operator count so that it is in line with the changes in Keras. For reference, the change in keras repo was done in hash b6abfaed132 "Remove unnecessary reshape layer in MobileNet architecture". --- .../test_arm_compute_lib/test_network.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 8fcafe489cb9..b5b9ed6b6ef9 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -16,6 +16,8 @@ # under the License. """Arm Compute Library network tests.""" +from distutils.version import LooseVersion + import numpy as np import pytest from tvm import testing @@ -111,6 +113,7 @@ def get_model(): def test_mobilenet(): + keras = pytest.importorskip("keras") Device.load("test_config.json") if skip_runtime_test(): @@ -131,8 +134,25 @@ def get_model(): mod, params = _get_keras_model(mobilenet, inputs) return mod, params, inputs + if keras.__version__ < LooseVersion("2.9"): + # This can be removed after we migrate to TF/Keras >= 2.9 + expected_tvm_ops = 56 + expected_acl_partitions = 31 + else: + # In Keras >= 2.7, one reshape operator was removed + # from the MobileNet model, so it impacted this test + # which now needs to be reduce in by 1 + # The change in Keras is `b6abfaed1326e3c` + expected_tvm_ops = 55 + expected_acl_partitions = 30 + _build_and_run_network( - *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01 + *get_model(), + device=device, + tvm_ops=expected_tvm_ops, + acl_partitions=expected_acl_partitions, + atol=0.002, + rtol=0.01, ) From 3983a472c6f3ad4ad9604ceeffdf80cce01d166b Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 23 Aug 2022 07:37:39 +0100 Subject: [PATCH 007/704] [COMMUNITY] @konturn -> Reviewer (#12543) Co-authored-by: Leandro Nunes --- CONTRIBUTORS.md | 1 + 
1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7c6f2dfa7112..e3b4fe339a4f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -121,6 +121,7 @@ We do encourage everyone to work anything they are interested in. - [Elen Kalda](https://github.com/ekalda): @ekalda - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - [Michael J. Klaiber](https://github.com/MichaelJKlaiber/) @MichaelJKlaiber +- [Noah Kontur](https://github.com/konturn/) @konturn - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - [Denise Kutnick](https://github.com/denise-k): @denise-k - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 From 383bd419310fac4d9d78e0c59760cbef3efa5555 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Tue, 23 Aug 2022 08:38:45 +0100 Subject: [PATCH 008/704] Fix TFLite 2.9 tests (#12130) This pr fixes the tests that will be broken when we will update TFLite to the 2.9 version. We will update TensorFlow and TFLite versions to 2.9 so that we can benefit from improvements in packaging to support multiple platforms and Operating Systems. 
--- python/tvm/relay/frontend/keras.py | 8 +++-- tests/python/frontend/tflite/test_forward.py | 33 +++++++++++++++----- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 3f7a96544a65..8c8a4a1ddcd3 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -635,9 +635,11 @@ def _convert_pooling( _op.nn.global_max_pool2d(inexpr, **global_pool_params), keras_layer, etab, data_layout ) if pool_type == "GlobalAveragePooling2D": - return _convert_flatten( - _op.nn.global_avg_pool2d(inexpr, **global_pool_params), keras_layer, etab, data_layout - ) + global_avg_pool2d = _op.nn.global_avg_pool2d(inexpr, **global_pool_params) + keep_dims = len(keras_layer.input.shape) == len(keras_layer.output.shape) + if keep_dims: + return global_avg_pool2d + return _convert_flatten(global_avg_pool2d, keras_layer, etab, data_layout) pool_h, pool_w = keras_layer.pool_size stride_h, stride_w = keras_layer.strides params = { diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 9121721d8ea2..7267b725483d 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -963,6 +963,10 @@ def representative_data_gen(): input_node = subgraph.Tensors(model_input).Name().decode("utf-8") tflite_output = run_tflite_graph(tflite_model_quant, data) + if tf.__version__ < LooseVersion("2.9"): + input_node = data_in.name.replace(":0", "") + else: + input_node = "serving_default_" + data_in.name + ":0" tvm_output = run_tvm_graph(tflite_model_quant, data, input_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-2, atol=1e-2 @@ -1997,10 +2001,12 @@ def _test_abs(data, quantized, int_quant_dtype=tf.int8): # TFLite 2.6.x upgrade support if tf.__version__ < LooseVersion("2.6.1"): in_node = ["serving_default_input_int8"] - else: + elif 
tf.__version__ < LooseVersion("2.9"): in_node = ( ["serving_default_input_int16"] if int_quant_dtype == tf.int16 else ["tfl.quantize"] ) + else: + in_node = "serving_default_input" tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( @@ -2028,8 +2034,10 @@ def _test_rsqrt(data, quantized, int_quant_dtype=tf.int8): tf.math.rsqrt, data, int_quant_dtype=int_quant_dtype ) tflite_output = run_tflite_graph(tflite_model_quant, data) - in_node = ["tfl.quantize"] - + if tf.__version__ < LooseVersion("2.9"): + in_node = ["tfl.quantize"] + else: + in_node = "serving_default_input" tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 @@ -2110,7 +2118,10 @@ def _test_cos(data, quantized, int_quant_dtype=tf.int8): tf.math.cos, data, int_quant_dtype=int_quant_dtype ) tflite_output = run_tflite_graph(tflite_model_quant, data) - in_node = ["tfl.quantize"] + if tf.__version__ < LooseVersion("2.9"): + in_node = ["tfl.quantize"] + else: + in_node = "serving_default_input" tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 @@ -3024,7 +3035,6 @@ def _test_quantize_dequantize(data): add = tf.keras.layers.Add()([data_in, relu]) concat = tf.keras.layers.Concatenate(axis=0)([relu, add]) keras_model = tf.keras.models.Model(inputs=data_in, outputs=concat) - input_name = data_in.name.split(":")[0] # To create quantized values with dynamic range of activations, needs representative dataset def representative_data_gen(): @@ -3034,7 +3044,11 @@ def representative_data_gen(): tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True) tflite_output = run_tflite_graph(tflite_model_quant, data) - tvm_output = run_tvm_graph(tflite_model_quant, data, input_name) + if tf.__version__ < 
LooseVersion("2.9"): + in_node = data_in.name.split(":")[0] + else: + in_node = "serving_default_" + data_in.name + ":0" + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 ) @@ -3051,7 +3065,6 @@ def _test_quantize_dequantize_const(data): add = tf.keras.layers.Add()([data, relu]) concat = tf.keras.layers.Concatenate(axis=0)([relu, add]) keras_model = tf.keras.models.Model(inputs=data_in, outputs=concat) - input_name = data_in.name.split(":")[0] # To create quantized values with dynamic range of activations, needs representative dataset def representative_data_gen(): @@ -3061,7 +3074,11 @@ def representative_data_gen(): tflite_model_quant = _quantize_keras_model(keras_model, representative_data_gen, True, True) tflite_output = run_tflite_graph(tflite_model_quant, data) - tvm_output = run_tvm_graph(tflite_model_quant, data, input_name) + if tf.__version__ < LooseVersion("2.9"): + in_node = data_in.name.split(":")[0] + else: + in_node = "serving_default_" + data_in.name + ":0" + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) tvm.testing.assert_allclose( np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 ) From 52779f1273b05d53d8213e23e70d9b0ac82fd0b9 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Tue, 23 Aug 2022 10:00:34 +0100 Subject: [PATCH 009/704] [CMSIS-NN] Pad fusion with QNN Conv2D (#12353) Pass that fuses nn.pad and qnn.conv2d for CMSIS-NN target. 
--- python/tvm/relay/op/contrib/cmsisnn.py | 50 ++- .../backend/contrib/cmsisnn/fuse_pads.cc | 209 +++++++++++ .../contrib/test_cmsisnn/test_conv2d.py | 277 ++++++++++++-- .../contrib/test_cmsisnn/test_fuse_pads.py | 340 ++++++++++++++++++ tests/python/contrib/test_cmsisnn/utils.py | 45 ++- 5 files changed, 886 insertions(+), 35 deletions(-) create mode 100644 src/relay/backend/contrib/cmsisnn/fuse_pads.cc create mode 100644 tests/python/contrib/test_cmsisnn/test_fuse_pads.py diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index 8d714b7269d9..b887fafd7e00 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -59,6 +59,7 @@ def partition_for_cmsisnn(mod, params=None, mod_name="default", **opts): transform.AnnotateTarget("cmsis-nn"), transform.PartitionGraph(mod_name=mod_name), GenerateCMSISNNConstants(), + CMSISNNFusePads(), ScalarToTensorConstants(), ExtractConstantsFromPartitionedFunction(), transform.InferType(), @@ -91,10 +92,18 @@ def check_qnn_softmax(pattern): and dequantize_call.args[0].checked_type.dtype == "int8" ) - def qnn_conv2d_pattern(): - """Create pattern for qnn.conv2D with optional fused relu.""" + def qnn_conv2d_pattern(with_pad): + """Create pattern for qnn.conv2D with optional pad and/or optional fused relu.""" + conv2d_input = wildcard() + if with_pad: + conv2d_input = is_op("nn.pad")(wildcard(), is_constant()) qnn_conv2d = is_op("qnn.conv2d")( - wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant() + conv2d_input, + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), ) bias_add = is_op("nn.bias_add")(qnn_conv2d, is_constant()) req = is_op("qnn.requantize")( @@ -136,7 +145,7 @@ def check_qnn_conv2d(pattern): ): is_depthwise = True - return ( + ret = ( conv2d.attrs.out_dtype == "int32" and conv2d_input.checked_type.dtype == "int8" and conv2d_weight.checked_type.dtype == "int8" @@ -145,6 
+154,36 @@ def check_qnn_conv2d(pattern): and all([zp == 0 for zp in kernel_zp]) and (not is_depthwise or bias_add is not None) ) + return ret + + def check_qnn_conv2d_pad(pattern): + """Check if the Pad followed by Conv2D is supported by CMSIS-NN.""" + if str(pattern.op.name) == "clip": + relu = pattern + requantize = relu.args[0] + else: + requantize = pattern + requantize_input = requantize.args[0] + if str(requantize_input.op.name) == "nn.bias_add": + bias_add = requantize_input + conv2d = bias_add.args[0] + else: + conv2d = requantize_input + conv2d_input = conv2d.args[0] + + # check if sum of paddings from pad() and conv2d() satisfies CMSIS-NN constraints + can_pad_be_fused = True + if isinstance(conv2d_input, tvm.relay.expr.Call) and str(conv2d_input.op.name) == "nn.pad": + pad_top, pad_left, pad_bottom, pad_right = GetEffectiveConv2DPadding( + conv2d, conv2d_input + ) + # check if difference in the side paddings is 1 along each dimension + pad_w_diff = int(pad_right - pad_left) + pad_h_diff = int(pad_bottom - pad_top) + can_pad_be_fused = pad_w_diff in [0, 1] and pad_h_diff in [0, 1] + + ret = check_qnn_conv2d(pattern) and can_pad_be_fused + return ret def qnn_fully_connected_pattern(): """Create pattern for qnn.dense with optional Relu.""" @@ -275,7 +314,8 @@ def check_qnn_binary_op(pattern): ) return [ - ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(), check_qnn_conv2d), + ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(with_pad=True), check_qnn_conv2d_pad), + ("cmsis-nn.qnn_conv2d", qnn_conv2d_pattern(with_pad=False), check_qnn_conv2d), ("cmsis-nn.qnn_fully_connected", qnn_fully_connected_pattern(), check_qnn_fully_connected), ("cmsis-nn.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_qnn_avg_pool2d), ("cmsis-nn.qnn_max_pool2d", qnn_max_pool2d_pattern(), check_qnn_max_pool2d), diff --git a/src/relay/backend/contrib/cmsisnn/fuse_pads.cc b/src/relay/backend/contrib/cmsisnn/fuse_pads.cc new file mode 100644 index 000000000000..71c31c303588 --- /dev/null +++ 
b/src/relay/backend/contrib/cmsisnn/fuse_pads.cc @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file src/relay/backend/contrib/cmsisnn/fuse_pads.cc + * \brief Fuses pads that precede qnn.conv2d ops inside CMSIS-NN composite functions. + */ + +#include +#include +#include +#include +#include + +#include "../../../op/make_op.h" +#include "../../../qnn/utils.h" +#include "../../../transforms/pattern_utils.h" +#include "convolutions.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace cmsisnn { + +inline IntImm ToIntImm(int32_t value) { return IntImm(DataType::Int(32), value); } + +/*! + * \brief From padding attributes of nn.pad and qnn.conv2d, calculates effective padding along H + * and W dimensions. 
+ */ +Array GetEffectiveConv2DPadding(Expr conv2d, Expr pad) { + // pad_width: ((), (top, bottom), (left, right), ()) for NHWC layout + // conv2d_attrs->padding: (top, left, bottom, right) + auto* conv2d_call = conv2d.as(); + auto* conv2d_attrs = conv2d_call->attrs.as(); + std::string data_layout = conv2d_attrs->data_layout.c_str(); + int pos_h = data_layout.find("H"); + int pos_w = data_layout.find("W"); + + auto* pad_call = pad.as(); + Array> pad_width = pad_call->attrs.as()->pad_width; + int pad_top = + qnn::get_const_int(conv2d_attrs->padding[0]) + qnn::get_const_int(pad_width[pos_h][0]); + int pad_left = + qnn::get_const_int(conv2d_attrs->padding[1]) + qnn::get_const_int(pad_width[pos_w][0]); + int pad_bottom = + qnn::get_const_int(conv2d_attrs->padding[2]) + qnn::get_const_int(pad_width[pos_h][1]); + int pad_right = + qnn::get_const_int(conv2d_attrs->padding[3]) + qnn::get_const_int(pad_width[pos_w][1]); + + return {ToIntImm(pad_top), ToIntImm(pad_left), ToIntImm(pad_bottom), ToIntImm(pad_right)}; +} + +/*! + * \brief This Mutator will find all partitioned functions meant for CMSIS-NN Conv2D. + * Then, it will fuse preceding pads with qnn.conv2d. + */ +class FusePadsMutator : public MixedModeMutator { + public: + explicit FusePadsMutator(const IRModule& mod) : mod_(mod) {} + + private: + /*! + * \brief In order to eliminate preceding nn.pad op, pad_width of nn.pad is passed onto + * convolution layer to update Conv2DAttrs's padding attribute. 
*/ + void UpdateConv2DPadding(const CallNode* conv2d_call, const CallNode* pad_call, + Attrs* new_attrs) { + Array effective_padding = + GetEffectiveConv2DPadding(GetRef(conv2d_call), GetRef(pad_call)); + int pad_top = effective_padding[0]->value; + int pad_left = effective_padding[1]->value; + int pad_bottom = effective_padding[2]->value; + int pad_right = effective_padding[3]->value; + int pad_diff_w = pad_right - pad_left; + int pad_diff_h = pad_bottom - pad_top; + bool can_pad_be_fused = + ((pad_diff_w == 0 || pad_diff_w == 1) && (pad_diff_h == 0 || pad_diff_h == 1)); + std::string error = "Difference on each side of a dimension should be either 0 or 1. "; + error += "Effective padding in this case: (pad_top, pad_left, pad_bottom, pad_right)=("; + error += std::to_string(pad_top); + error += ", "; + error += std::to_string(pad_left); + error += ", "; + error += std::to_string(pad_bottom); + error += ", "; + error += std::to_string(pad_right); + error += ")"; + ICHECK(can_pad_be_fused) << error; + + // Prepare new attrs as padding has changed + auto* conv2d_attrs = conv2d_call->attrs.as(); + auto attrs = make_object(); + attrs->strides = std::move(conv2d_attrs->strides); + attrs->dilation = std::move(conv2d_attrs->dilation); + attrs->groups = conv2d_attrs->groups; + attrs->channels = std::move(conv2d_attrs->channels); + attrs->kernel_size = std::move(conv2d_attrs->kernel_size); + attrs->data_layout = std::move(conv2d_attrs->data_layout); + attrs->kernel_layout = std::move(conv2d_attrs->kernel_layout); + attrs->out_layout = std::move(conv2d_attrs->out_layout); + attrs->out_dtype = std::move(conv2d_attrs->out_dtype); + attrs->padding = {pad_top, pad_left, pad_bottom, pad_right}; + *new_attrs = tvm::Attrs{attrs}; + } + + /*! + * \brief Identifies the sequence for qnn.conv2D and fuses the preceding nn.pad present within the + * CMSIS-NN partitioned function. 
*/ + Expr FusePadConv2d(const CallNode* conv2d_call) { + // create new paddings for qnn.conv2d + tvm::Attrs new_conv2d_attrs = conv2d_call->attrs; + Expr new_conv2d_input = conv2d_call->args[0]; + if (auto* pad_call = conv2d_call->args[0].as()) { + if (auto* pad_call_op = pad_call->op.as()) { + if (pad_call_op->name == "nn.pad") { + new_conv2d_input = pad_call->args[0]; + UpdateConv2DPadding(conv2d_call, pad_call, &new_conv2d_attrs); + } + } + } + + // Conv2D arguments: pad's input + rest of the origin args + auto new_conv2d_args = conv2d_call->args; + new_conv2d_args.erase(new_conv2d_args.begin()); + new_conv2d_args.insert(new_conv2d_args.begin(), new_conv2d_input); + Call ret_call = Call(conv2d_call->op, new_conv2d_args, new_conv2d_attrs, {}); + return std::move(ret_call); + } + + Expr Rewrite_(const CallNode* call, const Expr& post) final { + Expr ret_call = post; + auto* post_call = post.as(); + + // Fuse nn.pad and qnn.conv2d + if (auto* conv2d_op = post_call->op.as()) { + if (conv2d_op->name == "qnn.conv2d") { + ret_call = FusePadConv2d(post_call); + } + } + + // Identify qnn.conv2d partitioned function + if (post_call->op.as()) { + auto* func = call->op.as(); + auto func_name = func->GetAttr(attr::kComposite); + if (func_name.defined() && func_name == "cmsis-nn.qnn_conv2d") { + Expr new_body = VisitExpr(func->body); + Function new_func = Function(FreeVars(new_body), new_body, func->ret_type, + FreeTypeVars(new_body, mod_), func->attrs); + ret_call = Call(new_func, post_call->args); + } + } + + return ret_call; + } + + private: + IRModule mod_; +}; + +IRModule FusePads(const IRModule& mod) { + for (auto gv : mod->GetGlobalVars()) { + Function func = Downcast(mod->Lookup(gv)); + + // only mutate CMSIS-NN partitioned functions + auto compiler_name = func->GetAttr(attr::kCompiler); + if (!compiler_name.defined() || compiler_name != "cmsis-nn") { + continue; + } + + auto fuse_pads_mutator = FusePadsMutator(mod); + auto new_func_body = 
fuse_pads_mutator.VisitExpr(func->body); + if (!new_func_body.same_as(func->body)) { + Function new_func = + Function(func->params, new_func_body, func->ret_type, func->type_params, func->attrs); + mod->Update(gv, new_func); + } + } + return mod; +} + +transform::Pass CMSISNNFusePads() { + runtime::TypedPackedFunc pass_func = + [=](IRModule m, transform::PassContext pc) { return FusePads(m); }; + return tvm::transform::CreateModulePass(pass_func, 0, "CMSISNNFusePads", {}); +} + +TVM_REGISTER_GLOBAL("relay.ext.cmsisnn.transform.CMSISNNFusePads").set_body_typed(CMSISNNFusePads); +TVM_REGISTER_GLOBAL("relay.ext.cmsisnn.transform.GetEffectiveConv2DPadding") + .set_body_typed(GetEffectiveConv2DPadding); + +} // namespace cmsisnn +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py index 502743387bfa..d33d71261613 100644 --- a/tests/python/contrib/test_cmsisnn/test_conv2d.py +++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py @@ -40,6 +40,7 @@ assert_partitioned_function, assert_no_external_function, create_test_runner, + CheckForPadsWithinCompositeFunc, ) @@ -62,23 +63,21 @@ def make_model( weight_format, enable_bias, relu_type, + input_op=None, ): """Return a model and any parameters it may have""" + if input_op: + op = input_op + else: + op = relay.var("input", shape=shape, dtype=dtype) + h_index = weight_format.index("H") w_index = weight_format.index("W") kernel_h = kernel_shape[h_index] kernel_w = kernel_shape[w_index] - invar = relay.var("input", shape=shape, dtype=dtype) p = (0, 0, 0, 0) if padding == "SAME": p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) - invar = relay.nn.pad( - invar, - pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)], - pad_value=input_zero_point, - pad_mode="constant", - ) - shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3]) rng = 
np.random.default_rng(12321) weight = tvm.nd.array( @@ -92,7 +91,7 @@ def make_model( weight_const = relay.const(weight, kernel_dtype) conv2d_kernel_sc = kernel_scale[0] if out_channels == 1 else kernel_scale conv = relay.qnn.op.conv2d( - invar, + op, weight_const, input_zero_point=relay.const(input_zero_point, "int32"), kernel_zero_point=relay.const(kernel_zero_point, "int32"), @@ -165,9 +164,9 @@ def test_conv2d_number_primfunc_args( input_zero_point, kernel_scale, kernel_zero_point, - dtype, - dtype, - dtype, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, ) model, params = make_model( @@ -265,9 +264,9 @@ def test_conv2d_symmetric_padding_int8( input_zero_point, kernel_scale, kernel_zero_point, - dtype, - dtype, - dtype, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, ) model, params = make_model( @@ -355,9 +354,110 @@ def test_conv2d_asymmetric_padding_int8( input_zero_point, kernel_scale, kernel_zero_point, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, + ) + + model, params = make_model( + ifm_shape, + kernel_shape, + input_zero_point, + input_scale, + kernel_zero_point, + kernel_scale, + output_zero_point, + output_scale, + padding, + strides, + dilation, + groups, dtype, dtype, - dtype, + out_channels, + weight_format, + enable_bias, + relu_type, + ) + orig_mod = make_module(model) + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod) + + # validate the output + rng = np.random.default_rng(12345) + inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + compile_and_run( + AOTTestModel( + module=cmsisnn_mod, + inputs=inputs, + outputs=output_list, + params=params, + output_tolerance=1, + ), + test_runner, + interface_api, + use_unpacked_api, + ) + + +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("ifm_shape", 
[(1, 25, 25, 12), (1, 64, 100, 4)]) +@pytest.mark.parametrize( + "pad_width", + [ + ((0, 0), (0, 1), (1, 2), (0, 0)), + ((0, 0), (1, 1), (1, 1), (0, 0)), + ((0, 0), (2, 2), (3, 4), (0, 0)), + ], +) +def test_pad_conv2d_fusion_int8( + ifm_shape, + pad_width, +): + """Tests QNN Conv2D where the padding is asymmetric on different sides of input""" + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + ifm_shape = (1, 25, 25, 12) + kernel_size = (5, 5) + strides = (2, 2) + dilation = (1, 1) + padding = "SAME" + dtype = "int8" + enable_bias = True + relu_type = "NONE" + input_zero_point = 10 + input_scale = 0.0128 + kernel_scale = [0.11, 0.22] + out_channels = 2 + groups = 1 + weight_format = "HWIO" + kernel_h = kernel_size[0] + kernel_w = kernel_size[1] + kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) + kernel_zero_point = 0 + in_min, in_max = get_range_for_dtype_str(dtype) + + output_scale, output_zero_point = get_conv2d_qnn_params( + kernel_shape, + input_scale, + input_zero_point, + kernel_scale, + kernel_zero_point, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, + ) + + invar = relay.var("input", shape=ifm_shape, dtype=dtype) + pad = relay.nn.pad( + invar, + pad_width=pad_width, # ((), (top, bottom), (left, right), ()) + pad_value=input_zero_point, + pad_mode="constant", ) model, params = make_model( @@ -379,12 +479,139 @@ def test_conv2d_asymmetric_padding_int8( weight_format, enable_bias, relu_type, + input_op=pad, ) orig_mod = make_module(model) cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod, False) + + # check pad is not present inside CMSIS-NN partitioned function + cmsisnn_func = None + for var in cmsisnn_mod.get_global_vars(): + if "cmsis_nn_main_0" in var.name_hint: + cmsisnn_func = cmsisnn_mod[var] + pad_verifier = CheckForPadsWithinCompositeFunc() + 
pad_verifier.visit_function(cmsisnn_func) + pad_verifier.assert_no_pads_within_func() + + # validate the output + rng = np.random.default_rng(12345) + inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)} + output_list = generate_ref_data(orig_mod["main"], inputs, params) + compile_and_run( + AOTTestModel( + module=cmsisnn_mod, + inputs=inputs, + outputs=output_list, + params=params, + output_tolerance=1, + ), + test_runner, + interface_api, + use_unpacked_api, + ) + + +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize( + "ifm_shape, pad_width, conv2d_padding", + [ + [(1, 25, 25, 12), ((0, 0), (0, 2), (1, 2), (0, 0)), "SAME"], + [(1, 64, 100, 4), ((0, 0), (1, 3), (1, 1), (0, 0)), "VALID"], + [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 5), (0, 0)), "SAME"], + ], +) +def test_invalid_pad_conv2d_fusion_int8( + ifm_shape, + pad_width, + conv2d_padding, +): + """Tests QNN Conv2D where the padding is asymmetric on different sides of input""" + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + ifm_shape = (1, 25, 25, 12) + kernel_size = (5, 5) + strides = (2, 2) + dilation = (1, 1) + dtype = "int8" + enable_bias = True + relu_type = "NONE" + input_zero_point = 10 + input_scale = 0.0128 + kernel_scale = [0.11, 0.22] + out_channels = 2 + groups = 1 + weight_format = "HWIO" + kernel_h = kernel_size[0] + kernel_w = kernel_size[1] + kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels) + kernel_zero_point = 0 + in_min, in_max = get_range_for_dtype_str(dtype) + + output_scale, output_zero_point = get_conv2d_qnn_params( + kernel_shape, + input_scale, + input_zero_point, + kernel_scale, + kernel_zero_point, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, + ) + + invar = relay.var("input", shape=ifm_shape, dtype=dtype) + pad = relay.nn.pad( + invar, + pad_width=pad_width, # ((), (top, bottom), (left, right), ()) + pad_value=input_zero_point, + pad_mode="constant", + ) + + 
model, params = make_model( + ifm_shape, + kernel_shape, + input_zero_point, + input_scale, + kernel_zero_point, + kernel_scale, + output_zero_point, + output_scale, + conv2d_padding, + strides, + dilation, + groups, + dtype, + dtype, + out_channels, + weight_format, + enable_bias, + relu_type, + input_op=pad, + ) + orig_mod = make_module(model) + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params) + # validate pattern matching assert_partitioned_function(orig_mod, cmsisnn_mod) + # check pad is only present inside main function + cmsisnn_func = None + for var in cmsisnn_mod.get_global_vars(): + if "cmsis_nn_main_0" in var.name_hint: + cmsisnn_func = cmsisnn_mod[var] + pad_verifier = CheckForPadsWithinCompositeFunc() + pad_verifier.visit_function(cmsisnn_func) + pad_verifier.assert_no_pads_within_func() + else: + main_func = cmsisnn_mod[var] + pad_verifier = CheckForPadsWithinCompositeFunc() + pad_verifier.visit_function(main_func) + pad_verifier.assert_pads_within_func() + # validate the output rng = np.random.default_rng(12345) inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)} @@ -506,10 +733,10 @@ def test_depthwise_int8( input_zero_point, kernel_scale, kernel_zero_point, - dtype, - dtype, - dtype, - True, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, + is_depthwise=True, ) model, params = make_model( @@ -611,10 +838,10 @@ def test_relay_conv2d_cmsisnn_depthwise_int8( input_zero_point, kernel_scale, kernel_zero_point, - dtype, - dtype, - dtype, - True, + input_dtype=dtype, + weights_dtype=dtype, + output_dtype=dtype, + is_depthwise=True, ) model, params = make_model( @@ -729,7 +956,7 @@ def test_invalid_parameters( in_dtype, kernel_dtype, in_dtype, - False, + is_depthwise=False, ) model, params = make_model( shape=ifm_shape, diff --git a/tests/python/contrib/test_cmsisnn/test_fuse_pads.py b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py new file mode 100644 index 000000000000..f57dc5cd5bab --- 
/dev/null +++ b/tests/python/contrib/test_cmsisnn/test_fuse_pads.py @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""CMSIS-NN integration tests: fuse_pads pass""" +import numpy as np +import pytest +import tvm +import tvm.testing +from tvm import relay +from .utils import CheckForPadsWithinCompositeFunc + +tvm._ffi._init_api("relay.ext.cmsisnn.transform", __name__) + + +def set_external_func_attr(func, compiler, ext_symbol): + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Compiler", compiler) + func = func.with_attr("global_symbol", ext_symbol) + return func + + +def set_composite_func_attr(func, name): + func = func.with_attr("Composite", name) + return func + + +@pytest.mark.parametrize( + "ifm_shape, pad_width, conv2d_padding, ofm_shape", + [ + [(1, 25, 25, 12), ((0, 0), (0, 2), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)], + [(1, 64, 100, 4), ((0, 0), (1, 3), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)], + [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 5), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)], + ], +) +def test_invalid_padding_for_fusion(ifm_shape, pad_width, conv2d_padding, ofm_shape): + """Negative tests for pads preceding Conv2D that cannot be fused.""" + dtype = 
"int8" + kernel_size = (3, 3) + ofm_channels = 2 + local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype) + pad = relay.nn.pad( + local_input, + pad_width=pad_width, # ((), (top, bottom), (left, right), ()) + pad_value=10, + pad_mode="constant", + ) + rng = np.random.default_rng(12321) + local_weight = tvm.nd.array( + rng.integers( + np.iinfo(dtype).min, + high=np.iinfo(dtype).max, + size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), + dtype=dtype, + ) + ) + local_weight = relay.const(local_weight, dtype) + conv2d = relay.qnn.op.conv2d( + pad, + local_weight, + relay.const(1, "int32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "float32"), + data_layout="NHWC", + kernel_layout="OHWI", + channels=ofm_channels, + kernel_size=(3, 3), + padding=conv2d_padding, + out_dtype="int32", + ) + requantize = relay.qnn.op.requantize( + conv2d, + relay.const(1, "float32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "int32"), + axis=0, + out_dtype=dtype, + ) + local_func = relay.Function(relay.analysis.free_vars(requantize), requantize) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d") + + mod = tvm.IRModule() + ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype) + call_local_func = relay.Call(local_func, [ext_input]) + extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func) + extern_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint) + mod[extern_var] = extern_func + + main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype) + call_extern_func = relay.Call(extern_var, [main_input]) + main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype)) + main_var = relay.GlobalVar("main") + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + + error_regex = r"Difference on each side of a 
dimension should be either 0 or 1" + + with pytest.raises(tvm.TVMError, match=error_regex): + mod = CMSISNNFusePads()(mod) + + +@pytest.mark.parametrize( + "ifm_shape, pad_width, conv2d_padding, ofm_shape", + [ + [(1, 25, 25, 12), ((0, 0), (0, 1), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)], + [(1, 64, 100, 4), ((0, 0), (1, 1), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)], + [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 2), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)], + ], +) +def test_pad_conv2d_fusion_noncmsisnn_target(ifm_shape, pad_width, conv2d_padding, ofm_shape): + """Tests the pads and conv2d fusion for non-cmsisnn targets. + It is expected that pad will not be fused with Conv2D in this case. + """ + dtype = "int8" + kernel_size = (3, 3) + ofm_channels = 2 + local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype) + pad = relay.nn.pad( + local_input, + pad_width=pad_width, # ((), (top, bottom), (left, right), ()) + pad_value=10, + pad_mode="constant", + ) + rng = np.random.default_rng(12321) + local_weight = tvm.nd.array( + rng.integers( + np.iinfo(dtype).min, + high=np.iinfo(dtype).max, + size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), + dtype=dtype, + ) + ) + local_weight = relay.const(local_weight, dtype) + conv2d = relay.qnn.op.conv2d( + pad, + local_weight, + relay.const(1, "int32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "float32"), + data_layout="NHWC", + kernel_layout="OHWI", + channels=ofm_channels, + kernel_size=(3, 3), + padding=conv2d_padding, + out_dtype="int32", + ) + requantize = relay.qnn.op.requantize( + conv2d, + relay.const(1, "float32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "int32"), + axis=0, + out_dtype=dtype, + ) + local_func = relay.Function(relay.analysis.free_vars(requantize), requantize) + local_func = set_composite_func_attr(local_func, "noncmsis-nn.qnn_conv2d") + + mod = tvm.IRModule() + ext_input = relay.var("ext_input", 
shape=ifm_shape, dtype=dtype) + call_local_func = relay.Call(local_func, [ext_input]) + extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func) + extern_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "noncmsis-nn", extern_var.name_hint) + mod[extern_var] = extern_func + + main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype) + call_extern_func = relay.Call(extern_var, [main_input]) + main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype)) + main_var = relay.GlobalVar("main") + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + + mod = CMSISNNFusePads()(mod) + pad_verifier = CheckForPadsWithinCompositeFunc() + pad_verifier.visit_function(mod[extern_var]) + pad_verifier.assert_pads_within_func() + + +@pytest.mark.parametrize( + "ifm_shape, pad_width, conv2d_padding, ofm_shape", + [ + [(1, 25, 25, 12), ((0, 0), (0, 1), (1, 2), (0, 0)), (1, 1, 1, 1), (1, 26, 28, 2)], + [(1, 64, 100, 4), ((0, 0), (1, 1), (1, 1), (0, 0)), (0, 0, 0, 0), (1, 64, 100, 2)], + [(1, 55, 55, 3), ((0, 0), (2, 1), (3, 2), (0, 0)), (0, 0, 1, 1), (1, 57, 59, 2)], + ], +) +def test_pad_conv2d_fusion(ifm_shape, pad_width, conv2d_padding, ofm_shape): + """Tests the pads and conv2d fusion.""" + dtype = "int8" + kernel_size = (3, 3) + ofm_channels = 2 + local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype) + pad = relay.nn.pad( + local_input, + pad_width=pad_width, # ((), (top, bottom), (left, right), ()) + pad_value=10, + pad_mode="constant", + ) + rng = np.random.default_rng(12321) + local_weight = tvm.nd.array( + rng.integers( + np.iinfo(dtype).min, + high=np.iinfo(dtype).max, + size=(ofm_channels, kernel_size[0], kernel_size[1], ifm_shape[3]), + dtype=dtype, + ) + ) + local_weight = relay.const(local_weight, dtype) + conv2d = relay.qnn.op.conv2d( + pad, + local_weight, + relay.const(1, "int32"), + relay.const(1, "int32"), + 
relay.const(1, "float32"), + relay.const(1, "float32"), + data_layout="NHWC", + kernel_layout="OHWI", + channels=ofm_channels, + kernel_size=(3, 3), + padding=conv2d_padding, + out_dtype="int32", + ) + requantize = relay.qnn.op.requantize( + conv2d, + relay.const(1, "float32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "int32"), + axis=0, + out_dtype=dtype, + ) + local_func = relay.Function(relay.analysis.free_vars(requantize), requantize) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d") + + mod = tvm.IRModule() + ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype) + call_local_func = relay.Call(local_func, [ext_input]) + extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func) + extern_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint) + mod[extern_var] = extern_func + + main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype) + call_extern_func = relay.Call(extern_var, [main_input]) + main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype)) + main_var = relay.GlobalVar("main") + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + + mod = CMSISNNFusePads()(mod) + pad_verifier = CheckForPadsWithinCompositeFunc() + pad_verifier.visit_function(mod[extern_var]) + pad_verifier.assert_no_pads_within_func() + + +def test_without_preceding_pad(): + """Tests the pass FusePads when padding is not present before qnn.conv2d.""" + dtype = "int8" + ifm_shape = (1, 56, 56, 64) + ofm_shape = (1, 56, 56, 64) + local_input = relay.var("local_input", shape=ifm_shape, dtype=dtype) + rng = np.random.default_rng(12321) + local_weight = tvm.nd.array( + rng.integers( + np.iinfo(dtype).min, + high=np.iinfo(dtype).max, + size=(64, 3, 3, 64), + dtype=dtype, + ) + ) + local_weight = relay.const(local_weight, dtype) + conv2d = relay.qnn.op.conv2d( + 
local_input, + local_weight, + relay.const(1, "int32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "float32"), + data_layout="NHWC", + kernel_layout="OHWI", + channels=64, + kernel_size=(3, 3), + padding=(1, 1, 1, 1), + out_dtype="int32", + ) + requantize = relay.qnn.op.requantize( + conv2d, + relay.const(1, "float32"), + relay.const(1, "int32"), + relay.const(1, "float32"), + relay.const(1, "int32"), + axis=0, + out_dtype=dtype, + ) + relu = relay.nn.relu(requantize) + local_func = relay.Function(relay.analysis.free_vars(relu), relu) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_conv2d") + + mod = tvm.IRModule() + ext_input = relay.var("ext_input", shape=ifm_shape, dtype=dtype) + call_local_func = relay.Call(local_func, [ext_input]) + extern_func = relay.Function(relay.analysis.free_vars(call_local_func), call_local_func) + extern_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", extern_var.name_hint) + mod[extern_var] = extern_func + + main_input = relay.var("main_input", shape=ifm_shape, dtype=dtype) + call_extern_func = relay.Call(extern_var, [main_input]) + main_func = relay.Function(relay.analysis.free_vars(call_extern_func), call_extern_func) + main_func = relay.Function([main_input], call_extern_func, relay.TensorType(ofm_shape, dtype)) + main_var = relay.GlobalVar("main") + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + + mod = CMSISNNFusePads()(mod) + pad_verifier = CheckForPadsWithinCompositeFunc() + pad_verifier.visit_function(mod[extern_var]) + pad_verifier.assert_no_pads_within_func() diff --git a/tests/python/contrib/test_cmsisnn/utils.py b/tests/python/contrib/test_cmsisnn/utils.py index d36ec4219a0e..9fdb89289aff 100644 --- a/tests/python/contrib/test_cmsisnn/utils.py +++ b/tests/python/contrib/test_cmsisnn/utils.py @@ -50,8 +50,19 @@ def visit_call(self, call): return counter.count -def 
assert_partitioned_function(orig_mod, cmsisnn_mod): - """If kCompiler attribute is missing, this function raises assertion""" +def assert_partitioned_function(orig_mod, cmsisnn_mod, expected_ops_unchanged=True): + """ + If the kCompiler attribute is missing, this function raises an assertion. + + Parameters + ---------- + orig_mod : IRModule + Pre-partitioning module + cmsisnn_mod : IRModule + Post-partitioning module + expected_ops_unchanged : bool + Whether the number of CallNode(s) is expected to be the same before and after partitioning + """ attrs = [ cmsisnn_mod[var.name_hint].attrs for var in cmsisnn_mod.get_global_vars() @@ -64,9 +75,10 @@ def assert_partitioned_function(orig_mod, cmsisnn_mod): ] assert any(compilers), "Module does not contain function for cmsisnn target." - assert count_num_calls(orig_mod) == count_num_calls( - cmsisnn_mod - ), "Number of calls changed during partitioning" + if expected_ops_unchanged: + assert count_num_calls(orig_mod) == count_num_calls( + cmsisnn_mod + ), "Number of calls changed during partitioning" def assert_no_external_function(mod): @@ -228,6 +240,29 @@ def make_qnn_relu(expr, fused_activation_fn, scale, zero_point, dtype): raise ValueError("Invalid argument provided with fused_activation_fn") +class CheckForPadsWithinCompositeFunc(tvm.relay.ExprVisitor): + """Provides method to test number of pads present inside the function being visited.""" + + def __init__(self): + super().__init__() + self.num_pads_ = 0 + + def visit_call(self, call): + super().visit_call(call) + if ( + isinstance(call, tvm.relay.Call) + and isinstance(call.op, tvm.ir.op.Op) + and call.op.name == "nn.pad" + ): + self.num_pads_ += 1 + + def assert_no_pads_within_func(self): + assert self.num_pads_ == 0, "CMSIS-NN composite function should not have pads." + + def assert_pads_within_func(self): + assert self.num_pads_ > 0, "Composite function should have pads within it."
+ + def create_test_runner(compiler_cpu="cortex-m55", cpu_flags=""): """ Creates AOT test runner for CMSIS-NN tests. From d27167838888ca79eb53cd16449ae4483c9b6249 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 23 Aug 2022 14:03:04 +0100 Subject: [PATCH 010/704] [CI][AArch64] Skip libgomp failures in integration tests (#12554) Some integration tests are failing when running on CI machines that have torch installed (validated only on AArch64 for now), with an error message related to libgomp, similar to the one below: OSError: /.../dist-packages/torch/lib/libgomp-d22c30c5.so.1: cannot allocate memory in static TLS block As part of enabling the integration tests in AArch64, I'm marking these tests as skipped, so that tests can start executing and don't regress while we take time to investigate these specific failures. --- tests/python/driver/tvmc/test_autotuner.py | 9 +++++++++ tests/python/driver/tvmc/test_frontends.py | 9 +++++++++ tests/python/driver/tvmc/test_model.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/tests/python/driver/tvmc/test_autotuner.py b/tests/python/driver/tvmc/test_autotuner.py index 66017823a669..7c05ff804fa4 100644 --- a/tests/python/driver/tvmc/test_autotuner.py +++ b/tests/python/driver/tvmc/test_autotuner.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
+import platform import pytest import os @@ -73,6 +74,10 @@ def test_get_tuning_tasks(onnx_mnist): assert all([type(x) is expected_task_type for x in sut]) is True +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_tune_tasks__tuner__xgb(onnx_mnist, tmpdir_factory): pytest.importorskip("onnx") @@ -141,6 +146,10 @@ def test_tune_tasks__tuner__xgb__no_early_stopping(onnx_mnist, tmpdir_factory): _tuner_test_helper(onnx_mnist, "xgb", tmpdir_name, early_stopping=None) +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_tune_tasks__tuner__xgb__no_tuning_records(onnx_mnist, tmpdir_factory): pytest.importorskip("onnx") diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index 0cd02181ac40..98659b05ae5c 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
+import platform import pytest import builtins import importlib @@ -74,6 +75,10 @@ def test_guess_frontend_onnx(): assert type(sut) is tvmc.frontends.OnnxFrontend +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_guess_frontend_pytorch(): # some CI environments wont offer pytorch, so skip in case it is not present pytest.importorskip("torch") @@ -245,6 +250,10 @@ def test_load_model__pth(pytorch_resnet18): assert "layer1.0.conv1.weight" in tvmc_model.params.keys() +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_load_quantized_model__pth(pytorch_mobilenetv2_quantized): # some CI environments wont offer torch, so skip in case it is not present pytest.importorskip("torch") diff --git a/tests/python/driver/tvmc/test_model.py b/tests/python/driver/tvmc/test_model.py index 74c1c4ded8a4..fb1f718c1bed 100644 --- a/tests/python/driver/tvmc/test_model.py +++ b/tests/python/driver/tvmc/test_model.py @@ -55,6 +55,10 @@ def test_tvmc_workflow(use_vm, keras_simple): assert "output_0" in result.outputs.keys() +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) @pytest.mark.parametrize("use_vm", [True, False]) def test_save_load_model(use_vm, keras_simple, tmpdir_factory): pytest.importorskip("onnx") From ff46fa15e063ef499f666e63b9d5ed3faf2e3bfb Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 23 Aug 2022 14:28:12 +0100 Subject: [PATCH 011/704] [ETHOSN] Fix requantize output conversion (#12540) Fixes a small issue when converting the output information to the support library API. The `requantize_info` output datatype needed updating with the output datatype from the relay function to ensure the graph is compiled correctly by the support library. 
Included a test to prevent regression in the future. --- .../backend/contrib/ethosn/ethosn_api.cc | 22 ++++--- .../contrib/test_ethosn/test_requantize.py | 63 +++++++++++++++++++ 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index c828762096d6..55d0b57bcc2f 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -678,11 +678,17 @@ EthosnError EthosnAPI::Relu(const Expr& expr, ReluParams* params) { EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) { Call call = Downcast(expr); - const auto* input_dtype = call->args[0]->checked_type().as(); + const auto* input_ttype = call->args[0]->checked_type().as(); sl::TensorShape input_tensor_shape = {1, 1, 1, 1}; sl::DataType input_data_type; - EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape); - err += Tvm2Npu(input_dtype->dtype, &input_data_type); + EthosnError err = Tvm2Npu(input_ttype->shape, &input_tensor_shape); + err += Tvm2Npu(input_ttype->dtype, &input_data_type); + + const auto* output_ttype = call->checked_type().as(); + sl::TensorShape output_tensor_shape = {1, 1, 1, 1}; + sl::DataType output_data_type; + err += Tvm2Npu(output_ttype->shape, &output_tensor_shape); + err += Tvm2Npu(output_ttype->dtype, &output_data_type); float input_sc, output_sc; int input_zp, output_zp; @@ -699,14 +705,10 @@ EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) { sl::QuantizationInfo requantize_q_info; err += Tvm2Npu(output_zp, output_sc, &requantize_q_info); params->requantize_info = sl::RequantizeInfo(requantize_q_info); + params->requantize_info.m_OutputDataType = output_data_type; - sl::TensorInfo output_info = params->input_info; - output_info.m_QuantizationInfo = params->requantize_info.m_OutputQuantizationInfo; - if (params->requantize_info.m_OutputDataType.has_value()) { - 
output_info.m_DataType = params->requantize_info.m_OutputDataType.value(); - } - params->output_info = output_info; - + params->output_info = sl::TensorInfo(output_tensor_shape, output_data_type, sl::DataFormat::NHWC, + requantize_q_info); return err; } diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py index 4626a0d92bc1..e20c3beeabfa 100644 --- a/tests/python/contrib/test_ethosn/test_requantize.py +++ b/tests/python/contrib/test_ethosn/test_requantize.py @@ -68,6 +68,69 @@ def test_requantize(in_dtype, out_dtype, shape): tei.verify(outputs, out_dtype, 1) +@requires_ethosn +def test_requantize_mixed_precision_with_following_op(): + """ + Checks a requantize operation that changes precision from uint8 to int8 with a + following add op. + """ + np.random.seed(0) + shape = (1, 4, 6, 8) + in_sc = 0.012566 + in_zp = 131 + out_sc = 0.012566 + out_zp = 3 + in_dtype = "uint8" + out_dtype = "int8" + + def get_model(): + a = relay.var("a", shape=shape, dtype=in_dtype) + b = relay.var("b", shape=shape, dtype=out_dtype) + req = relay.qnn.op.requantize( + data=a, + input_scale=relay.const(in_sc, "float32"), + input_zero_point=relay.const(in_zp, "int32"), + output_scale=relay.const(out_sc, "float32"), + output_zero_point=relay.const(out_zp, "int32"), + out_dtype=out_dtype, + ) + req = relay.qnn.op.add( + req, + b, + lhs_scale=relay.const(out_sc, "float32"), + lhs_zero_point=relay.const(out_zp, "int32"), + rhs_scale=relay.const(out_sc, "float32"), + rhs_zero_point=relay.const(out_zp, "int32"), + output_scale=relay.const(out_sc, "float32"), + output_zero_point=relay.const(out_zp, "int32"), + ) + return req + + inputs = { + "a": tvm.nd.array( + np.random.randint( + low=np.iinfo(in_dtype).min, high=np.iinfo(in_dtype).max, size=shape, dtype=in_dtype + ) + ), + "b": tvm.nd.array( + np.random.randint( + low=np.iinfo(out_dtype).min, + high=np.iinfo(out_dtype).max, + size=shape, + dtype=out_dtype, + ) + ), + } + outputs 
= [] + for npu in [False, True]: + model = get_model() + mod = tei.make_module(model, {}) + x = tei.build_and_run(mod, inputs, 1, {}, npu=npu) + outputs.append(x) + + tei.verify(outputs, out_dtype, 1) + + @requires_ethosn def test_requantize_failure(): input_sc = 0.8 From dd7ae2d3e5a7e169021e75d0c9d0f6a8cc477a9c Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Tue, 23 Aug 2022 09:51:04 -0600 Subject: [PATCH 012/704] [Relay] Add Rsqrt to SimplifyExpr (#12363) * Add Rsqrt to SimplifyExpr * fix unit tests --- python/tvm/relay/op/_tensor.py | 1 + python/tvm/relay/op/contrib/dnnl.py | 3 ++- src/relay/transforms/simplify_expr.cc | 24 +++++++++++++++++++ tests/python/relay/test_pass_simplify_expr.py | 19 +++++++++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 37cb263c489d..a04199f6a5b1 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -292,6 +292,7 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("right_shift", False, broadcast_shape_func) register_shape_func("sqrt", False, elemwise_shape_func) +register_shape_func("rsqrt", False, elemwise_shape_func) register_shape_func("negative", False, elemwise_shape_func) register_shape_func("exp", False, elemwise_shape_func) register_shape_func("tan", False, elemwise_shape_func) diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py index 4ef342a26b0b..f7752e41b056 100644 --- a/python/tvm/relay/op/contrib/dnnl.py +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -856,7 +856,8 @@ def __init__(self): added_eps = is_op("add")(mp1, eps) deno = is_op("sqrt")(added_eps) div_out = is_op("divide")(diff, deno) - weighted = is_op("multiply")(div_out, self.gamma) + div_out2 = diff * is_op("rsqrt")(added_eps) + weighted = is_op("multiply")(div_out | div_out2, self.gamma) added_bias = is_op("add")(weighted, self.beta) self.pattern = added_bias diff --git 
a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index 04d0edb26d75..a6751933a88c 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -685,6 +685,29 @@ class SimplifyConsecutiveAdd : public DFPatternRewrite { DFPattern const2_; }; +class SimplifyRSqrt : public DFPatternRewrite { + public: + SimplifyRSqrt() { + x_ = IsWildcard(); + numerator_ = IsWildcard(); + auto sqrt = IsOp("sqrt"); + pattern_ = IsOp("divide")({numerator_, sqrt({x_})}); + } + + Expr Callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + static const Op& op = Op::Get("rsqrt"); + auto x = node_map[x_][0]; + auto numerator = node_map[numerator_][0]; + return Call(Op::Get("multiply"), {numerator, Call(op, {x})}); + } + + private: + /*! \brief Pattern input */ + DFPattern x_; + DFPattern numerator_; +}; + Expr SimplifyExpr(const Expr& expr, const IRModule& mod) { // the rewrites will be applied in the given order, and repeated until fixed point DFPatternRewriteComposer composer; @@ -694,6 +717,7 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) { composer.AddRewrite(); composer.AddRewrite(); composer.AddRewrite(); + composer.AddRewrite(); composer.AddRewrite(); composer.AddRewrite(); composer.AddRewrite(); diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index 162ac6e73ddb..837b15a48dc1 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -584,5 +584,24 @@ def expected(): assert tvm.ir.structural_equal(zzl, after) +def test_simplify_rsqrt(): + shape = (32, 1, 1) + x = relay.var("x", shape=shape, dtype="float32") + + def before(c): + return relay.const(c) / relay.sqrt(x) + + def expected(c): + if c == 1: + return relay.rsqrt(x) + else: + return relay.const(c) * relay.rsqrt(x) + + for c in [1.0, 2.0, 2.5]: + opt = run_opt_pass(before(c), transform.SimplifyExpr()) + 
after = run_opt_pass(expected(c), transform.InferType()) + assert tvm.ir.structural_equal(opt, after) + + if __name__ == "__main__": pytest.main([__file__]) From da5836f230525afe8984dcbfea8ee788a6286b5c Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Tue, 23 Aug 2022 23:32:56 +0700 Subject: [PATCH 013/704] [AutoTVM] Add support for text buffers to ApplyHistoryBest (#12521) Currently, AutoTVM's ApplyHistoryBest class does not support loading tuning logs from memory. This is a pet peeve of mine, as it requires you to work with a tempfile whenever writing autotuning tests. This is also just strange, as the rest of AutoTVM has support for text buffers (e.g. tvm.autotvm.callback.log_to_file supports passing in a text buffer, letting us write to but not read from them). Additionally, ApplyHistoryBest handles input arguments very unintuitively. Before this PR, it allowed users to pass string filepaths, a list of string filepaths, or an Iterable (such as a list) of input and result tuples. However, it did not support taking in StringIO objects as mentioned above, nor pathlib.Path objects, nor combinations of a filepath and an Iterable of tuples. In a perfect world, we would change ApplyHistoryBest to take as input a path-like object, file-like object, or an Iterable of input and result tuples (similar to what ApplyGraphBest takes as an argument). However, this would break the existing functionality to take as input a list of filepaths. 
To be backwards compatible, while fixing this issue, this pull request defines a new type inside dispatcher.py: Records = Union[ Union[str, bytes, Path], # Path-like objects TextIOBase, # File-like objects Iterable[Tuple[MeasureInput, MeasureResult]], ] It then rewrites ApplyHistoryBest.load so it takes the following arguments: def load(self, records: Union[Records, Iterable[Records]]): This PR also adds unit tests for this new functionality, and fixes a relevant bug in tests/micro/common/test_autotune.py in which a StringIO object was passed to apply_history_best, causing it to appear to pass but not actually read any data. --- python/tvm/autotvm/record.py | 31 ++++++- python/tvm/autotvm/task/dispatcher.py | 87 ++++++++++-------- tests/micro/common/test_autotune.py | 1 + tests/python/unittest/test_autotvm_record.py | 92 ++++++++++++++++---- 4 files changed, 155 insertions(+), 56 deletions(-) diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index b2faee243be0..8e54e011c0b7 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -20,10 +20,12 @@ import argparse import base64 +from io import TextIOBase import logging import pickle import json import time +from typing import Union import os import itertools from collections import OrderedDict @@ -194,20 +196,41 @@ def clean_json_to_python(x): raise RuntimeError("Invalid log protocol: " + protocol) -def load_from_file(filename): - """Generator: load records from file. +def load_from_buffer(file: TextIOBase): + """Generator: load records from buffer. This is a generator that yields the records. 
Parameters ---------- - filename: str + file: io.TextIOBase Yields ------ input: autotvm.measure.MeasureInput result: autotvm.measure.MeasureResult """ - with open(filename) as f: + for row in file: + if row and not row.startswith("#"): + ret = decode(row) + if ret is None: + continue + yield ret + + +def load_from_file(filepath: Union[str, bytes, os.PathLike]): + """Generator: load records from path. + This is a generator that yields the records. + + Parameters + ---------- + filepath: str, bytes, or os.PathLike + + Yields + ------ + input: autotvm.measure.MeasureInput + result: autotvm.measure.MeasureResult + """ + with open(filepath) as f: for row in f: if row and not row.startswith("#"): ret = decode(row) diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 11a608d4cbbf..8b2e7eb01fe6 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -30,18 +30,26 @@ from __future__ import absolute_import as _abs +from io import TextIOBase import logging -import typing -from typing import Union -from collections.abc import Iterable +from os import PathLike +from pathlib import Path +from typing import List, Iterable, Tuple, Union import numpy as np from .space import FallbackConfigEntity from .. import env as _env +from ..measure import MeasureInput, MeasureResult logger = logging.getLogger("autotvm") +Records = Union[ + Union[str, bytes, Path], # Path-like objects + TextIOBase, # File-like objects + Iterable[Tuple[MeasureInput, MeasureResult]], +] + class DispatchContext(object): """ @@ -194,7 +202,7 @@ class ApplyFixedConfig(DispatchContext): Name of schedules to use. 
""" - def __init__(self, tasks, schedule_names: Union[str, typing.List[str]]): + def __init__(self, tasks, schedule_names: Union[str, List[str]]): super(ApplyFixedConfig, self).__init__() if isinstance(schedule_names, str): self._schedule_names = list(schedule_names) @@ -238,15 +246,15 @@ class ApplyHistoryBest(DispatchContext): Parameters ---------- - records : str, list of str, or iterator of (autotvm.measure.MeasureInput,\ - autotvm.measure.MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. If it is a list, it can either be - a list of paths to log files that will be loaded jointly or an iterator or records. + records : None, Records, or iterator of Records objects, where a + Records object is a path-like object, a file-like object, + or an iterator of (MeasureInput, MeasureResult). + + Collection of tuning records. If multiple Records objects are passed, their + contents will be merged. """ - def __init__(self, records): + def __init__(self, records: Union[None, Records, Iterable[Records]]): super(ApplyHistoryBest, self).__init__() self.best_by_targetkey = {} @@ -256,46 +264,48 @@ def __init__(self, records): if records: self.load(records) - def load(self, records): + def load(self, records: Union[Records, Iterable[Records]]): """Load records to this dispatch context Parameters ---------- records : str, list of str, or iterator of (autotvm.measure.MeasureInput,\ autotvm.measure.MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. If it is a list - it can either be a list of paths to logs that will be loaded jointly or - an iterator of measurement results. + + Collection of tuning records. If multiple Records objects are passed, their + contents will be merged. 
""" # pylint: disable=import-outside-toplevel - from pathlib import Path - from ..record import load_from_file + from ..record import load_from_file, load_from_buffer - joint_records = [] - if not isinstance(records, Iterable) or isinstance(records, str): - records = [records] + def _unpack_records( + records: Union[Records, Iterable[Records]] + ) -> List[Tuple[MeasureInput, MeasureResult]]: - for rec in records: - if isinstance(rec, Path): - rec = str(rec) + if isinstance(records, (str, bytes, PathLike)): + return load_from_file(records) - if isinstance(rec, str): - rec = load_from_file(rec) - joint_records += rec - else: - if rec is not None: - joint_records.append(rec) + if isinstance(records, TextIOBase): + return load_from_buffer(records) - if not joint_records: + joint_records = [] + for record in records: + if isinstance(record, Tuple) and isinstance(record[0], MeasureInput): + joint_records.append(record) + else: + joint_records += _unpack_records(record) + + return joint_records + + flattened_records = _unpack_records(records) + if not flattened_records: return best_by_targetkey = self.best_by_targetkey best_by_model = self.best_by_model counter = 0 - for inp, res in joint_records: + for inp, res in flattened_records: counter += 1 if res.error_no != 0: continue @@ -447,7 +457,7 @@ class ApplyGraphBest(DispatchContext): node index. """ - def __init__(self, records): + def __init__(self, records: Records): """ Parameters ---------- @@ -458,11 +468,16 @@ def __init__(self, records): Otherwise, it is an iterator. 
""" # pylint: disable=import-outside-toplevel - from ..record import load_from_file + from ..record import load_from_file, load_from_buffer super(ApplyGraphBest, self).__init__() - if isinstance(records, str): + if isinstance(records, (str, bytes, PathLike)): records = load_from_file(records) + elif isinstance(records, TextIOBase): + records = load_from_buffer(records) + else: + records = list(records) + self._records = list(records) self._counter = 0 self._global_cfg_dict = {} diff --git a/tests/micro/common/test_autotune.py b/tests/micro/common/test_autotune.py index b79260dd46ed..46f6d8889a9a 100644 --- a/tests/micro/common/test_autotune.py +++ b/tests/micro/common/test_autotune.py @@ -61,6 +61,7 @@ def test_kws_autotune_workflow(platform, board, tmp_path): assert logs[0]["config"]["entity"] != logs[1]["config"]["entity"] # Compile the best model with AOT and connect to it + str_io_logs.seek(0) with tvm.micro.testing.create_aot_session( platform, board, diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py index 147122ff10d6..693810d3f979 100644 --- a/tests/python/unittest/test_autotvm_record.py +++ b/tests/python/unittest/test_autotvm_record.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. 
"""test the correctness of dump and load of data log""" +from io import StringIO +from os import PathLike import time -import tvm -from tvm import te from tvm.contrib import utils from tvm import autotvm @@ -78,23 +78,83 @@ def test_file_io(): assert str(x) == str(inputs[0][2]) -def test_apply_history_best(): +def test_apply_history_best(tmpdir): tsk, target = get_sample_task() + best = str(tsk.config_space.get(2)) - records = [ - (MeasureInput(target, tsk, tsk.config_space.get(0)), MeasureResult((0.1,), 0, 2.3, 0)), - (MeasureInput(target, tsk, tsk.config_space.get(1)), MeasureResult((0.3,), 0, 2.3, 0)), - (MeasureInput(target, tsk, tsk.config_space.get(2)), MeasureResult((0.01,), 0, 2.3, 0)), - (MeasureInput(target, tsk, tsk.config_space.get(4)), MeasureResult((0.4,), 0, 2.3, 0)), - ] - hist_best = ApplyHistoryBest(records) - x = hist_best.query(target, tsk.workload) - assert str(x) == str(tsk.config_space.get(2)) + inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(3)] + results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)] + results_batch_1.append(MeasureResult((0.5,), 0, 2.3, 0)) - # Confirm same functionality for iterators. 
- hist_best = ApplyHistoryBest(iter(records)) - x = hist_best.query(target, tsk.workload) - assert str(x) == str(tsk.config_space.get(2)) + # Write data out to file + filepath_batch_1 = tmpdir / "batch_1.log" + with open(filepath_batch_1, "w") as file: + autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1) + + # Load best results from Path + assert isinstance(filepath_batch_1, PathLike) + hist_best = ApplyHistoryBest(filepath_batch_1) + assert str(hist_best.query(target, tsk.workload)) == best + + # Load best results from str(Path) + hist_best = ApplyHistoryBest(str(filepath_batch_1)) + assert str(hist_best.query(target, tsk.workload)) == best + + # Write data into StringIO buffer + stringio_batch_1 = StringIO() + assert isinstance(filepath_batch_1, PathLike) + callback = autotvm.callback.log_to_file(stringio_batch_1) + callback(None, inputs_batch_1, results_batch_1) + stringio_batch_1.seek(0) + + # Load best results from strIO + hist_best = ApplyHistoryBest(stringio_batch_1) + assert str(hist_best.query(target, tsk.workload)) == best + + # Load best result from list of tuples (MeasureInput, MeasureResult) + hist_best = ApplyHistoryBest(list(zip(inputs_batch_1, results_batch_1))) + assert str(hist_best.query(target, tsk.workload)) == best + + # Same thing, but iterable instead of list (i.e. 
no subscripting) + hist_best = ApplyHistoryBest(zip(inputs_batch_1, results_batch_1)) + assert str(hist_best.query(target, tsk.workload)) == best + + +def test_apply_history_best_multiple_batches(tmpdir): + tsk, target = get_sample_task() + best = str(tsk.config_space.get(2)) + + inputs_batch_1 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2)] + results_batch_1 = [MeasureResult((i,), 0, 0, 0) for i in range(1, 3)] + filepath_batch_1 = tmpdir / "batch_1.log" + with open(filepath_batch_1, "w") as file: + autotvm.callback.log_to_file(file)(None, inputs_batch_1, results_batch_1) + + inputs_batch_2 = [MeasureInput(target, tsk, tsk.config_space.get(i)) for i in range(2, 4)] + results_batch_2 = [MeasureResult((0.5,), 0, 0, 0), MeasureResult((3,), 0, 0, 0)] + filepath_batch_2 = tmpdir / "batch_2.log" + with open(filepath_batch_2, "w") as file: + autotvm.callback.log_to_file(file)(None, inputs_batch_2, results_batch_2) + + # Check two Path filepaths works + hist_best = ApplyHistoryBest([filepath_batch_1, filepath_batch_2]) + assert str(hist_best.query(target, tsk.workload)) == best + + # Check that an arbitrary Iterable of Paths works + # Calling zip() on a single list gives a non-subscriptable Iterable + hist_best = ApplyHistoryBest(zip([filepath_batch_1, filepath_batch_2])) + assert str(hist_best.query(target, tsk.workload)) == best + + # Check that Iterable of Iterable of tuples is correctly merged + hist_best = ApplyHistoryBest( + zip( + [ + zip(inputs_batch_1, results_batch_1), + zip(inputs_batch_2, results_batch_2), + ] + ) + ) + assert str(hist_best.query(target, tsk.workload)) == best if __name__ == "__main__": From 1d71c1b4aad72843540f897e27c01aa73256a463 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 23 Aug 2022 11:08:55 -0700 Subject: [PATCH 014/704] [skip ci][ci] Mark more ethosu tests with xfail (#12560) See #12511 for context. 
Since more parameterizations are popping up as failed, this disables whole tests rather than specific combinations of parameters. --- .../python/contrib/test_ethosu/test_codegen.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index 5b4643edb4a0..ae7d0821bb7f 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -347,9 +347,7 @@ def binary_elementwise(lhs, rhs): ([1, 4, 4], [4, 1]), ], ) -@tvm.testing.xfail_parameterizations( - "ifm_shape0-ifm2_shape0-ethos-u55-64", reason="See https://github.com/apache/tvm/issues/12511" -) +@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_binary_add_with_non_4d_shapes( request, accel_type, @@ -608,9 +606,7 @@ def rounding_right_shift(lhs, rhs): @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @pytest.mark.parametrize("ifm_shape", [(3, 2), (1, 15, 11, 7), (3, 1, 12), (400,)]) @pytest.mark.parametrize("ifm_scale, ifm_zp, ofm_scale, ofm_zp", [(1, 0, 1, 0), (0.015, 3, 0.2, 5)]) -@tvm.testing.xfail_parameterizations( - "1-0-1-0-ifm_shape3-ethos-u55-128", reason="See https://github.com/apache/tvm/issues/12511" -) +@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_ethosu_identity_codegen( request, ifm_shape, ifm_scale, ifm_zp, ofm_scale, ofm_zp, accel_type ): @@ -659,6 +655,7 @@ def generate_output_data(input_data): ((8, 7, 3), (-4, 1, 8, -2)), ], ) +@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_relay_reshape_codegen(ifm_shape, new_shape, accel_type): np.random.seed(0) @@ -691,9 +688,7 @@ def create_model(): ([5000], [123], [2151]), ], ) -@tvm.testing.xfail_parameterizations( - "ifm_shape3-begin3-size3-ethos-u55-32", reason="See https://github.com/apache/tvm/issues/12511" -) +@pytest.mark.xfail(reason="See 
https://github.com/apache/tvm/issues/12511") def test_tflite_slice(request, accel_type, ifm_shape, begin, size): np.random.seed(0) @@ -729,9 +724,7 @@ def strided_slice_func(x): "ifm_shape", [[1, 5, 12, 4], [1, 1, 2], [4, 3, 2], [10, 20], [345]], ) -@tvm.testing.xfail_parameterizations( - "ifm_shape4-ABS-ethos-u55-64", reason="See https://github.com/apache/tvm/issues/12511" -) +@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_ethosu_unary_elementwise( request, accel_type, From 99b9b74b12d8687966c8d009a9a0cfca6f36defc Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Tue, 23 Aug 2022 20:49:52 +0100 Subject: [PATCH 015/704] [CI] Remove Vela from ci_cpu (#12533) While the dependencies for microNPU and CMSIS-NN moved into ci_cortexm, Vela is still installed in ci_cpu. As a result, we have some of the microNPU tests outside of test_ethosu folder failing since they use presence of Vela to decide whether to skip the test. This change will * remove Vela from ci_cpu * remove unnecessary PATH update --- docker/Dockerfile.ci_cpu | 7 ------- 1 file changed, 7 deletions(-) diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 013ebfb59e88..3812bfbd197e 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -124,13 +124,6 @@ RUN bash /install/ubuntu_install_androidsdk.sh ENV ANDROID_HOME=/opt/android-sdk-linux/ ENV ANDROID_NDK_HOME=/opt/android-sdk-linux/ndk/21.3.6528147/ -# Install Vela compiler -COPY install/ubuntu_install_vela.sh /install/ubuntu_install_vela.sh -RUN bash /install/ubuntu_install_vela.sh - -# Update PATH -ENV PATH /opt/arm/gcc-arm-none-eabi/bin:/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:$PATH - # PaddlePaddle deps COPY install/ubuntu_install_paddle.sh /install/ubuntu_install_paddle.sh RUN bash /install/ubuntu_install_paddle.sh From 4d104e5ec6b02d0b1d08c93c26bc322f54189cba Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Tue, 23 Aug 2022 20:57:24 +0100 Subject: [PATCH 016/704] 
[ETHOSN] Add support for special indices of Reshape (#12556) This pr adds support for the special indices values of the reshape operator for the Arm(R) Ethos(TM)-N NPU. --- .../backend/contrib/ethosn/ethosn_api.cc | 38 +++---------------- .../contrib/test_ethosn/test_reshape.py | 37 ++++-------------- 2 files changed, 12 insertions(+), 63 deletions(-) diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index 55d0b57bcc2f..c1f67d0d2b16 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -36,6 +36,7 @@ #include #include +#include "../../../op/tensor/transform.h" #include "ethosn_support_library/Support.hpp" #include "ethosn_support_library/SupportQueries.hpp" #include "tvm/relay/qnn/attrs.h" @@ -293,12 +294,6 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) { // Create input info Call reshape = Downcast(expr); const auto* input_dtype = reshape->args[0]->checked_type().as(); - const auto& reshape_attrs = reshape->attrs.as(); - - if (reshape_attrs->newshape.size() > params->new_shape.size()) { - return EthosnError(ErrStrm() << "reshape dimension=" << reshape_attrs->newshape.size() - << ", reshape dimension must be <= " << params->new_shape.size()); - } sl::TensorShape input_tensor_shape = {1, 1, 1, 1}; sl::DataType input_data_type; @@ -309,35 +304,12 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) { tensor_size *= dim; } - int infer_index = -1; - int reshaped_size = 1; - Array inferred_shape = {1, 1, 1, 1}; - for (size_t i = 0; i < reshape_attrs->newshape.size(); i++) { - int value = reshape_attrs->newshape[i].as()->value; - if (value < -1) { - return EthosnError(ErrStrm() - << "reshape dimension=" << value << ", reshape dimension must be >= -1"); - } - if (value == -1) { - if (infer_index != -1) { - return EthosnError("only one reshape dimension can be inferred"); - } - infer_index = i; - } else { 
- inferred_shape.Set(i, value); - reshaped_size *= value; - } + Array inferred_shape = {1, 1, 1, 1}; + Array new_shape = InferNewShape(input_dtype->shape, reshape->attrs, false); + for (size_t i = 0; i < new_shape.size(); ++i) { + inferred_shape.Set(i, new_shape[i]); } - if (infer_index != -1) { - if (tensor_size % reshaped_size != 0) { - return EthosnError(ErrStrm() - << "reshaped size=" << reshaped_size - << ", must be an integer factor of the input size " << tensor_size); - } - int value = tensor_size / reshaped_size; - inferred_shape.Set(infer_index, Integer(value)); - } err += Tvm2Npu(inferred_shape, ¶ms->new_shape); params->input_info = sl::TensorInfo(input_tensor_shape, input_data_type, params->input_info.m_DataFormat, diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py index 6266367e90cc..cb8a49be2d81 100644 --- a/tests/python/contrib/test_ethosn/test_reshape.py +++ b/tests/python/contrib/test_ethosn/test_reshape.py @@ -20,7 +20,6 @@ import tvm from tvm import relay from tvm.testing import requires_ethosn -from tvm.relay.op.contrib import get_pattern_table import numpy as np import pytest from . 
import infrastructure as tei @@ -43,7 +42,14 @@ def _get_model(input_shape, output_shape, dtype): ((1, 15, 4, 1), (1, 30, 2)), ((1, 15, 4, 1), (1, 4, 15, 1)), ((1, 15, 4, 1), (1, 12, 5, 1)), + ((1, 15, 4, 1), (1, 0, 2, 2)), ((1, 15, 4, 1), (1, -1, 2, 1)), + ((1, 15, 4, 1), (1, -2)), + ((1, 15, 4, 1), (1, -3, 1, 1)), + ((1, 15, 4, 1), (1, -4, 3, 5, 4)), + ((1, 15, 4, 1), (0, -1, -2)), + ((1, 15, 4, 1), (0, -1, -3, 1)), + ((1, 15, 4, 1), (1, -4, -1, 5, 4)), ], ) def test_reshape(dtype, input_shape, output_shape): @@ -65,32 +71,3 @@ def test_reshape(dtype, input_shape, output_shape): outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) tei.verify(outputs, dtype, 1) - - -@requires_ethosn -@pytest.mark.parametrize( - "input_shape, output_shape, dtype, err_msg", - [ - ( - (1, 15, 4, 1), - (1, 15, -2), - "uint8", - "reshape dimension=-2, reshape dimension must be >= -1", - ), - ( - (1, 1, 4, 1), - (1, 1, 2, 2, 1), - "uint8", - "reshape dimension=5, reshape dimension must be <= 4", - ), - ], -) -def test_reshape_failure(input_shape, output_shape, dtype, err_msg): - np.random.seed(0) - model, params = _get_model(input_shape, output_shape, dtype) - mod = tei.make_module(model, params) - pattern = get_pattern_table("ethos-n") - mod = tei.make_module(model, params) - mod = relay.transform.MergeComposite(pattern)(mod) - mod = tei.make_ethosn_partition(mod["main"].body) - tei.test_error(mod, {}, err_msg) From 8c23469e2098659cffcbfebb56b4d32c0df7a6ed Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Tue, 23 Aug 2022 13:10:50 -0700 Subject: [PATCH 017/704] [MicroTVM] add heap-size to project options (#12390) * heap-size is added to project options * change stm32l4r5zi recommended heap size * change stm32l4r5zi recommended heap size * addressing comments * addressing comments * addressing comments Co-authored-by: Mohamad --- .../zephyr/template_project/boards.json | 3 +- .../template_project/microtvm_api_server.py | 45 +++++++++++++++++++ 
.../template_project/src/host_driven/main.c | 2 +- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json index aae764a8239e..dcca9c800224 100644 --- a/apps/microtvm/zephyr/template_project/boards.json +++ b/apps/microtvm/zephyr/template_project/boards.json @@ -54,7 +54,8 @@ "is_qemu": false, "fpu": true, "vid_hex": "0483", - "pid_hex": "374b" + "pid_hex": "374b", + "recommended_heap_size_bytes": 512000 }, "qemu_cortex_r5": { "board": "qemu_cortex_r5", diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index eb20c3e88448..38a7ec0c2939 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -195,6 +195,33 @@ def _get_device_args(options): ) +def _get_board_mem_size_bytes(options): + board_file_path = ( + pathlib.Path(get_zephyr_base(options)) + / "boards" + / "arm" + / options["zephyr_board"] + / (options["zephyr_board"] + ".yaml") + ) + try: + with open(board_file_path) as f: + board_data = yaml.load(f, Loader=yaml.FullLoader) + return int(board_data["ram"]) * 1024 + except: + _LOG.warning("Board memory information is not available.") + return None + + +DEFAULT_HEAP_SIZE_BYTES = 216 * 1024 + + +def _get_recommended_heap_size_bytes(options): + prop = BOARD_PROPERTIES[options["zephyr_board"]] + if "recommended_heap_size_bytes" in prop: + return prop["recommended_heap_size_bytes"] + return DEFAULT_HEAP_SIZE_BYTES + + def generic_find_serial_port(serial_number=None): """Find a USB serial port based on its serial number or its VID:PID. 
@@ -370,6 +397,12 @@ def _get_nrf_device_args(options): type="bool", help="Run on the FVP emulator instead of hardware.", ), + server.ProjectOption( + "heap_size_bytes", + optional=["generate_project"], + type="int", + help="Sets the value for HEAP_SIZE_BYTES passed to K_HEAP_DEFINE() to service TVM memory allocation requests.", + ), ] @@ -595,6 +628,18 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec cmake_f.write(line) + heap_size = _get_recommended_heap_size_bytes(options) + if options.get("heap_size_bytes"): + board_mem_size = _get_board_mem_size_bytes(options) + heap_size = options["heap_size_bytes"] + if board_mem_size is not None: + assert ( + heap_size < board_mem_size + ), f"Heap size {heap_size} is larger than memory size {board_mem_size} on this board." + cmake_f.write( + f"target_compile_definitions(app PUBLIC -DHEAP_SIZE_BYTES={heap_size})\n" + ) + if options.get("compile_definitions"): flags = options.get("compile_definitions") for item in flags: diff --git a/apps/microtvm/zephyr/template_project/src/host_driven/main.c b/apps/microtvm/zephyr/template_project/src/host_driven/main.c index c0286dc0c74f..7dd082e2e588 100644 --- a/apps/microtvm/zephyr/template_project/src/host_driven/main.c +++ b/apps/microtvm/zephyr/template_project/src/host_driven/main.c @@ -142,7 +142,7 @@ tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { } // Heap for use by TVMPlatformMemoryAllocate. -K_HEAP_DEFINE(tvm_heap, 216 * 1024); +K_HEAP_DEFINE(tvm_heap, HEAP_SIZE_BYTES); // Called by TVM to allocate memory. 
tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLDevice dev, void** out_ptr) { From 13ebbfb37f8cec1da71d88fbcbecdd4ad4d24dcc Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 23 Aug 2022 15:44:34 -0500 Subject: [PATCH 018/704] Replace std::result_of (deprecated in C++17) with std::invoke_result, NFC (#12562) --- include/tvm/script/printer/traced_object.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/script/printer/traced_object.h b/include/tvm/script/printer/traced_object.h index 4c09b0a41b79..cb63c31cd4a5 100644 --- a/include/tvm/script/printer/traced_object.h +++ b/include/tvm/script/printer/traced_object.h @@ -450,7 +450,7 @@ class TracedBasicValue { * \brief Transform the wrapped value without changing its path. */ template - typename detail::TracedObjectWrapperSelector::type>::Type + typename detail::TracedObjectWrapperSelector::type>::Type ApplyFunc(F&& f) const { return MakeTraced(f(value_), path_); } From 8174d082e8168db9ad63826c9d68aee8c76c7090 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 23 Aug 2022 16:42:43 -0500 Subject: [PATCH 019/704] Add using directives for otherwise hidden virtual functions, NFC (#12561) This silences warning ``` warning: 'foo' hides overloaded virtual functions [-Woverloaded-virtual] ``` typically caused by overriding only some overloads of `VisitExpr_` from a set defined in the base class. --- src/relay/backend/annotate_used_memory.cc | 2 +- src/relay/transforms/annotate_texture_storage.cc | 4 ++++ src/relay/transforms/compiler_function_utils.cc | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/relay/backend/annotate_used_memory.cc b/src/relay/backend/annotate_used_memory.cc index ad370c73ad1e..4dcdb2e541c5 100644 --- a/src/relay/backend/annotate_used_memory.cc +++ b/src/relay/backend/annotate_used_memory.cc @@ -110,7 +110,7 @@ class AnnotateUsedMemoryMutator : public transform::DeviceAwareExprMutator { /*! 
* \brief Establish which let bindings have primitive function values. */ - std::pair PreVisitLetBinding_(const Var& var, const Expr& value) { + std::pair PreVisitLetBinding_(const Var& var, const Expr& value) override { if (const auto* func_node = value.as()) { ICHECK(func_node->attrs.HasNonzeroAttr(attr::kPrimitive)) << "Expect top-level functions to be primitive."; diff --git a/src/relay/transforms/annotate_texture_storage.cc b/src/relay/transforms/annotate_texture_storage.cc index b3ed28db4574..c9cf45e06929 100644 --- a/src/relay/transforms/annotate_texture_storage.cc +++ b/src/relay/transforms/annotate_texture_storage.cc @@ -117,6 +117,8 @@ class StorageInfo : private transform::DeviceAwareExprVisitor { } private: + using transform::DeviceAwareExprVisitor::VisitExpr_; + void Visit(const Expr& expr) { // Pre-order traversal to enable upward propagation // of consumer storage scopes to producers when desirable. @@ -426,6 +428,8 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator { using VarMap = std::unordered_map; public: + using transform::DeviceAwareExprMutator::VisitExpr_; + explicit RewriteVDStorageScopes(const Map>>& storage_scope) : transform::DeviceAwareExprMutator(Optional()), storage_scope_(storage_scope) {} diff --git a/src/relay/transforms/compiler_function_utils.cc b/src/relay/transforms/compiler_function_utils.cc index 1dafcd10a361..f1e7e223541b 100644 --- a/src/relay/transforms/compiler_function_utils.cc +++ b/src/relay/transforms/compiler_function_utils.cc @@ -54,6 +54,8 @@ const FunctionNode* AsFunctionNode(const Expr& expr, const std::string& compiler */ class Outliner : public MixedModeMutator { public: + using MixedModeMutator::VisitExpr_; + Outliner(GlobalSymbolCache* cache, std::string compiler_filter, IRModule mod) : cache_(cache), compiler_filter_(std::move(compiler_filter)), mod_(std::move(mod)) {} From c15cc5ef6d36288abd58587b7bf4f0440596a54f Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 23 Aug 2022 
21:02:27 -0700 Subject: [PATCH 020/704] [Target] Remove deprecated parameters from target (#12416) * remove deprecated parameters in target * lint * fix cpp tests fix * remove more configs in test files * address comments * fix error * fix hexagon * fix micro tutorial * fix integration tests * fix hexagon * lint * fix unittest * fix readme * fix assert executor in target * address comments * fix tutorials * fix hexagon target * fix tutorial * fix for tutorials * hexagon --- apps/hexagon_launcher/README.md | 4 +- apps/howto_deploy/prepare_test_libs.py | 2 +- apps/sgx/src/build_model.py | 7 +- .../wasm-graph/tools/build_graph_lib.py | 9 ++- .../ci_logs/matmul.json | 2 +- .../ci_logs/resnet-50-NHWC-B1-llvm.json | 52 ++++++------- .../ci_logs/sparse_dense.json | 2 +- .../tune_with_autotvm/tune_relay_x86.py | 4 +- .../how_to/work_with_microtvm/micro_tvmc.sh | 4 +- gallery/tutorial/auto_scheduler_matmul_x86.py | 2 - python/tvm/contrib/hexagon/pytest_plugin.py | 2 +- python/tvm/relay/build_module.py | 78 ------------------- python/tvm/target/target.py | 19 +---- src/target/target_kind.cc | 38 ++------- tests/cpp/c_codegen_test.cc | 10 +-- tests/cpp/target_test.cc | 4 +- .../test_hexagon/topi/test_softmax_slice.py | 1 - tests/python/driver/tvmc/test_target.py | 6 +- .../python/driver/tvmc/test_target_options.py | 2 +- tests/python/relay/aot/test_cpp_aot.py | 2 +- tests/python/relay/aot/test_crt_aot.py | 36 --------- tests/python/relay/test_build_module.py | 47 +++-------- .../test_tir_transform_common_subexpr_elim.py | 2 +- .../unittest/test_tvmscript_roundtrip.py | 4 +- tests/scripts/task_python_docs.sh | 2 +- 25 files changed, 80 insertions(+), 261 deletions(-) diff --git a/apps/hexagon_launcher/README.md b/apps/hexagon_launcher/README.md index 210759a80c7c..cc433f245759 100644 --- a/apps/hexagon_launcher/README.md +++ b/apps/hexagon_launcher/README.md @@ -118,7 +118,7 @@ mod, params = relay.frontend.from_tflite( tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict 
) -target = tvm.target.hexagon('v68', link_params=True) +target = tvm.target.hexagon('v68') with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, tvm.target.Target(target, host=target), params=params, mod_name="default") @@ -172,7 +172,7 @@ A sample output JSON from running the Inception V3 model may look like When using AoT, the `target` needs to be `llvm`: ``` -aot_target = "llvm -keys=hexagon -link-params=0 -mattr=+hvxv69,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv69 -mtriple=hexagon" +aot_target = "llvm -keys=hexagon -mattr=+hvxv69,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp -mcpu=hexagonv69 -mtriple=hexagon" aot_host_target = aot_target ``` diff --git a/apps/howto_deploy/prepare_test_libs.py b/apps/howto_deploy/prepare_test_libs.py index a6c7688d2084..8e9f8b5f7335 100644 --- a/apps/howto_deploy/prepare_test_libs.py +++ b/apps/howto_deploy/prepare_test_libs.py @@ -33,7 +33,7 @@ def prepare_test_libs(base_path): fadd_dylib.export_library(dylib_path) # Compile library in system library mode - fadd_syslib = tvm.build(s, [A, B], "llvm --system-lib", name="addonesys") + fadd_syslib = tvm.build(s, [A, B], "llvm", name="addonesys") syslib_path = os.path.join(base_path, "test_addone_sys.o") fadd_syslib.save(syslib_path) diff --git a/apps/sgx/src/build_model.py b/apps/sgx/src/build_model.py index 1fc297d8a094..ea3b4ed992ad 100755 --- a/apps/sgx/src/build_model.py +++ b/apps/sgx/src/build_model.py @@ -39,7 +39,12 @@ def main(): ) with tvm.transform.PassContext(opt_level=3): - graph, lib, params = relay.build(net, "llvm --system-lib", params=params) + graph, lib, params = relay.build( + net, + "llvm", + params=params, + runtime=tvm.relay.backend.Runtime("cpp", {"system-lib": True}), + ) build_dir = osp.abspath(sys.argv[1]) if not osp.isdir(build_dir): diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py index 9b262c398e00..c2f9089710a3 100755 --- 
a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py +++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py @@ -72,10 +72,15 @@ def build_graph_lib(opt_level): shape_dict = {input_name: img_data.shape} mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128 --system-lib" + target = "llvm -mtriple=wasm32-unknown-unknown -mattr=+simd128" with tvm.transform.PassContext(opt_level=opt_level): - factory = relay.build(mod, target=target, params=params) + factory = relay.build( + mod, + target=target, + params=params, + runtime=tvm.relay.backend.Runtime("cpp", {"system-lib": True}), + ) # Save the model artifacts to obj_file obj_file = os.path.join(out_dir, "graph.o") diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json index 2e3a98404dda..b0d33a911a63 100644 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json +++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json @@ -1,2 +1,2 @@ # Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. 
-{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu -link-params=0", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 1, 4], 1], ["SP", 2, 4, 1024, [1, 1, 8], 1], ["SP", 2, 8, 1024, [4], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 2], ["FSP", 4, 3, 1, 2], ["RE", 4, [0, 3, 1, 4, 2, 5]], ["CA", 2, 4, 3], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$8"], ["AN", 2, 9, 2], ["AN", 4, 4, 2]]]], "r": [[0.0044742], 0, 0.335558, 1607112214], "v": "v0.3"} +{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 1, 4], 1], ["SP", 2, 4, 1024, [1, 1, 8], 1], ["SP", 2, 8, 1024, [4], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 2], ["FSP", 4, 3, 1, 2], ["RE", 4, [0, 3, 1, 4, 2, 5]], ["CA", 2, 4, 3], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$8"], ["AN", 2, 9, 2], ["AN", 4, 4, 2]]]], "r": [[0.0044742], 0, 0.335558, 1607112214], "v": "v0.3"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json index 3dd4541fd33a..4fb148c887bd 100644 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json +++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json @@ -1,28 +1,28 @@ # Provide valid schedules for resnet-50 for CPU. # This is used to run the tutorial on the documentation web server. 
-{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"} -{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"} -{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 
0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"} -{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"} 
-{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"} -{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu 
-link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"} -{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 
1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, 
"auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], 
["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"} -{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 
21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 
0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 
0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"} -{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 
0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"} -{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 
0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 
0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"} +{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"} +{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"} +{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 
3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 
0.384722, 1606961002], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu 
-mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"} +{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 
1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": 
[[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 
0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"} +{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 
0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"} +{"i": 
[["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"} +{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 
3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"} +{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 
56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu 
-mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json index 
7c1c100124dc..9bf6af0b17d8 100644 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json +++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json @@ -1,2 +1,2 @@ # Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. -{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"} +{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"} diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py index 6e46fbd8ffc8..2ba597d1da19 100644 --- a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py +++ b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py @@ -298,7 +298,7 @@ def tune_and_evaluate(tuning_opt): # # Evaluation of the network been 
tuned on graph level: # Compile... -# Config for target=llvm -keys=cpu -link-params=0, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. -# Config for target=llvm -keys=cpu -link-params=0, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. +# Config for target=llvm -keys=cpu, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. +# Config for target=llvm -keys=cpu, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. # Evaluate inference time cost... # Mean inference time (std dev): 3.16 ms (0.03 ms) diff --git a/gallery/how_to/work_with_microtvm/micro_tvmc.sh b/gallery/how_to/work_with_microtvm/micro_tvmc.sh index 5ec718884559..0eaef9c6a836 100755 --- a/gallery/how_to/work_with_microtvm/micro_tvmc.sh +++ b/gallery/how_to/work_with_microtvm/micro_tvmc.sh @@ -99,7 +99,7 @@ wget https://github.com/tensorflow/tflite-micro/raw/main/tensorflow/lite/micro/e # # bash tvmc compile magic_wand.tflite \ - --target='c -keys=cpu -link-params=0 -model=host' \ + --target='c -keys=cpu -model=host' \ --runtime=crt \ --runtime-crt-system-lib 1 \ --executor='graph' \ @@ -111,7 +111,7 @@ tvmc compile magic_wand.tflite \ # bash # This will generate a ``model.tar`` file which contains TVM compiler output files. 
To run this command for # a different Zephyr device, you need to update ``target``. For instance, for ``nrf5340dk_nrf5340_cpuapp`` board -# the target is ``--target='c -keys=cpu -link-params=0 -model=nrf5340dk'``. +# the target is ``--target='c -keys=cpu -model=nrf5340dk'``. # diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py index 279987f00d81..98fd95c33878 100644 --- a/gallery/tutorial/auto_scheduler_matmul_x86.py +++ b/gallery/tutorial/auto_scheduler_matmul_x86.py @@ -44,8 +44,6 @@ testing.utils.install_request_hook(depth=3) # sphinx_gallery_end_ignore -import os - import numpy as np import tvm from tvm import te, auto_scheduler diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py index 6b61b6f4ba55..f735c81ee0aa 100644 --- a/python/tvm/contrib/hexagon/pytest_plugin.py +++ b/python/tvm/contrib/hexagon/pytest_plugin.py @@ -245,7 +245,7 @@ def terminate_rpc_servers(): aot_host_target = tvm.testing.parameter( "c", - "llvm -keys=hexagon -link-params=0 " + "llvm -keys=hexagon " "-mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp " "-mcpu=hexagonv68 -mtriple=hexagon", ) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index f3de1a085692..6cdc79ceb587 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -274,69 +274,6 @@ def _build_module_no_factory(mod, target=None, target_host=None, params=None, mo return _build_module_no_factory_impl(mod, target, target_host, params, mod_name) -def _reconstruct_from_deprecated_options(deprecated_params_target): - executor = None - runtime = None - - deprecated_executor = None - deprecated_executor_args = {} - if "executor" in deprecated_params_target.attrs: - _deprecated_target_param_warning("Executor", "executor") - deprecated_executor = deprecated_params_target.attrs.get("executor", "graph") - if "interface-api" in deprecated_params_target.attrs: - 
_deprecated_target_sub_param_warning("Executor", "interface-api") - deprecated_executor_args.update( - {"interface-api": deprecated_params_target.attrs["interface-api"]} - ) - if "unpacked-api" in deprecated_params_target.attrs: - _deprecated_target_sub_param_warning("Executor", "unpacked-api") - deprecated_executor_args.update( - {"unpacked-api": deprecated_params_target.attrs["unpacked-api"]} - ) - if ( - "link-params" in deprecated_params_target.attrs - and deprecated_params_target.attrs["link-params"] - ): - _deprecated_target_sub_param_warning("Executor", "link-params") - if deprecated_executor != "aot": - deprecated_executor_args.update( - {"link-params": deprecated_params_target.attrs["link-params"]} - ) - if deprecated_executor or deprecated_executor_args: - executor = Executor(deprecated_executor or "graph", deprecated_executor_args) - - deprecated_runtime = None - deprecated_runtime_args = {} - if "runtime" in deprecated_params_target.attrs: - _deprecated_target_param_warning("Runtime", "runtime") - deprecated_runtime = deprecated_params_target.attrs.get("runtime", "cpp") - if deprecated_runtime == "c": - deprecated_runtime = "crt" - if "system-lib" in deprecated_params_target.attrs: - _deprecated_target_sub_param_warning("Runtime", "system-lib") - deprecated_runtime_args.update({"system-lib": deprecated_params_target.attrs["system-lib"]}) - if deprecated_runtime or deprecated_runtime_args: - runtime = Runtime(deprecated_runtime or "cpp", deprecated_runtime_args) - - return executor, runtime - - -def _deprecated_target_param_warning(registry, param): - warnings.warn( - f"Please use {registry} (tvm.relay.backend.{registry}) " - f"instead of deprecated Target parameter -{param}", - DeprecationWarning, - ) - - -def _deprecated_target_sub_param_warning(registry, param): - warnings.warn( - f"Please use {registry} (tvm.relay.backend.{registry}) parameter {param} " - f"instead of deprecated Target parameter -{param}", - DeprecationWarning, - ) - - def build( 
ir_mod, target=None, @@ -415,17 +352,6 @@ def build( assert len(raw_targets) > 0 target_host = raw_targets[0].host - # All of this logic is to raise deprecation warnings for various parameters - # TODO(Mousius) Remove these after some time - deprecated_params_target = target_host or list(raw_targets)[0] - deprecated_executor, deprecated_runtime = _reconstruct_from_deprecated_options( - deprecated_params_target - ) - if deprecated_executor: - executor = deprecated_executor - if deprecated_runtime: - runtime = deprecated_runtime - # If current dispatch context is fallback context (the default root context), # then load pre-tuned parameters from TopHub if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext): @@ -756,9 +682,5 @@ def create_executor(kind="debug", mod=None, device=None, target="llvm", params=N if kind == "vm": return VMExecutor(mod, device, raw_targets) if kind == "aot": - # The AOT requires the executor as a target attribute. - # (The compilation paths for the other executors currently do not always provide this - # attribute, hence the above generic assert is more forgiving). - assert "executor" in raw_targets[0].attrs return AotExecutor(mod, device, raw_targets) raise RuntimeError("unknown execution strategy: {0}".format(kind)) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index e0e5f0177b5e..a558fcbeaf5b 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -636,8 +636,6 @@ def hexagon(cpu_ver="v66", **kwargs): Whether to use QFloat HVX instructions. use_ieee_fp : bool (default: False) Whether to use IEEE HVX instructions - link_params : bool (default: False) - Whether to link graph parameters into the LLVM module. Note: Floating point support in HVX requires LLVM 14+. 
""" @@ -671,7 +669,6 @@ def get_arch_version(cpu_ver): "llvm_options": None, "use_qfloat": arch_version >= 68, "use_ieee_fp": False, - "link_params": False, } config.update(kwargs) @@ -738,24 +735,10 @@ def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument args = [s.replace("=", "@") for s in llvm_options.split()] return "--llvm-options=" + ",".join(args) - # TVM target attributes string - def create_tvm_options(cpu_ver, config): # pylint: disable=unused-argument - """Create TVM target features string.""" - - features = { - "link_params": "link-params", - } - opts = "" - for k in config: - if k in features: - opts += " --" + features[k] + "=" + str(config[k]) - return opts - target_str = create_llvm_target(cpu_ver, config) llvm_str = create_llvm_options(cpu_ver, config) - tvm_str = create_tvm_options(cpu_ver, config) - args_list = target_str.split() + llvm_str.split() + tvm_str.split() + args_list = target_str.split() + llvm_str.split() return Target(" ".join(["hexagon"] + args_list)) diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 38ee536e7818..e3b2d7b096fd 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -264,12 +264,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) .add_attr_option("mtriple") .add_attr_option("mfloat-abi") .add_attr_option("mabi") - .add_attr_option("system-lib") - .add_attr_option("runtime") .add_attr_option("num-cores") - .add_attr_option("link-params", Bool(false)) - .add_attr_option("unpacked-api") - .add_attr_option("interface-api") // Fast math flags, see https://llvm.org/docs/LangRef.html#fast-math-flags .add_attr_option("fast-math") // implies all the below .add_attr_option("fast-math-nnan") @@ -310,23 +305,16 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU) // Hence the type is "uint". 
TVM_REGISTER_TARGET_KIND("c", kDLCPU) - .add_attr_option("system-lib") - .add_attr_option("link-params", Bool(false)) - .add_attr_option("runtime") .add_attr_option("mcpu") .add_attr_option("march") - .add_attr_option("executor") .add_attr_option("workspace-byte-alignment") .add_attr_option("constants-byte-alignment") - .add_attr_option("unpacked-api") - .add_attr_option("interface-api") .set_default_keys({"cpu"}) .set_target_parser(tvm::target::parsers::cpu::ParseTarget); TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) .add_attr_option("mcpu") .add_attr_option("arch") - .add_attr_option("system-lib") .add_attr_option("max_shared_memory_per_block") .add_attr_option("max_threads_per_block") .add_attr_option("thread_warp_size", Integer(32)) @@ -338,7 +326,6 @@ TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA) TVM_REGISTER_TARGET_KIND("nvptx", kDLCUDA) .add_attr_option("mcpu") .add_attr_option("mtriple") - .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(1024)) .add_attr_option("thread_warp_size", Integer(32)) .set_default_keys({"cuda", "gpu"}) @@ -348,7 +335,6 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .add_attr_option("mcpu") .add_attr_option("mtriple") .add_attr_option>("mattr") - .add_attr_option("system-lib") // TODO(masahi): Support querying from a target device // On RDNA cards, thread_warp_size should be 32 .add_attr_option("max_num_threads", Integer(256)) @@ -359,7 +345,6 @@ TVM_REGISTER_TARGET_KIND("rocm", kDLROCM) .set_target_parser(UpdateROCmAttrs); TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) - .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(1)) .add_attr_option("texture_spatial_limit", Integer(16384)) @@ -370,7 +355,6 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) // information about this limitation can be found here: // https://developer.apple.com/documentation/metal/buffers/about_argument_buffers?language=objc TVM_REGISTER_TARGET_KIND("metal", kDLMetal) - 
.add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(16)) .add_attr_option("max_function_args", Integer(31)) @@ -378,7 +362,6 @@ TVM_REGISTER_TARGET_KIND("metal", kDLMetal) TVM_REGISTER_TARGET_KIND("vulkan", kDLVulkan) .add_attr_option>("mattr") - .add_attr_option("system-lib") // Feature support .add_attr_option("supports_float16") .add_attr_option("supports_float32", Bool(true)) @@ -417,39 +400,30 @@ TVM_REGISTER_TARGET_KIND("vulkan", kDLVulkan) .set_default_keys({"vulkan", "gpu"}); TVM_REGISTER_TARGET_KIND("webgpu", kDLWebGPU) - .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(256)) .set_default_keys({"webgpu", "gpu"}); -TVM_REGISTER_TARGET_KIND("sdaccel", kDLOpenCL) - .add_attr_option("system-lib") +TVM_REGISTER_TARGET_KIND("sdaccel", kDLOpenCL) // line break .set_default_keys({"sdaccel", "hls"}); -TVM_REGISTER_TARGET_KIND("aocl", kDLAOCL) - .add_attr_option("system-lib") +TVM_REGISTER_TARGET_KIND("aocl", kDLAOCL) // line break .set_default_keys({"aocl", "hls"}); -TVM_REGISTER_TARGET_KIND("aocl_sw_emu", kDLAOCL) - .add_attr_option("system-lib") +TVM_REGISTER_TARGET_KIND("aocl_sw_emu", kDLAOCL) // line break .set_default_keys({"aocl", "hls"}); TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon) .add_attr_option>("mattr") .add_attr_option("mcpu") .add_attr_option("mtriple") - .add_attr_option("system-lib") - .add_attr_option("link-params", Bool(false)) .add_attr_option>("llvm-options") .set_default_keys({"hexagon"}); -TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU) // line break - .add_attr_option("system-lib"); +TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU); -TVM_REGISTER_TARGET_KIND("ext_dev", kDLExtDev) // line break - .add_attr_option("system-lib"); +TVM_REGISTER_TARGET_KIND("ext_dev", kDLExtDev); -TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU) // line break - .add_attr_option("system-lib"); +TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU); TVM_REGISTER_TARGET_KIND("composite", 
kDLCPU) // line break .add_attr_option>("devices"); diff --git a/tests/cpp/c_codegen_test.cc b/tests/cpp/c_codegen_test.cc index 442f76a8cff3..e764d21505d4 100644 --- a/tests/cpp/c_codegen_test.cc +++ b/tests/cpp/c_codegen_test.cc @@ -33,7 +33,7 @@ TEST(CCodegen, MainFunctionOrder) { std::string tvm_module_main = std::string(runtime::symbol::tvm_module_main); - tvm::Target target_c = tvm::Target("c -keys=cpu -link-params=0"); + tvm::Target target_c = tvm::Target("c -keys=cpu"); const int n = 4; Array shape{n}; @@ -104,16 +104,16 @@ TEST(CCodegen, FunctionOrder) { using namespace tvm; using namespace tvm::te; - Target target = Target("c -keys=cpu -link-params=0"); + Target target = Target("c -keys=cpu"); // add schedules in reverse order Map inputs; - inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered("op_2", target)); - inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered("op_1", target)); + inputs.Set(Target("c -keys=cpu"), BuildLowered("op_2", target)); + inputs.Set(Target("c -keys=cpu"), BuildLowered("op_1", target)); for (uint32_t counter = 99; IsSorted(inputs) && counter > 0; counter--) { std::string op_name = "op_" + std::to_string(counter); - inputs.Set(Target("c -keys=cpu -link-params=0"), BuildLowered(op_name, target)); + inputs.Set(Target("c -keys=cpu"), BuildLowered(op_name, target)); } EXPECT_FALSE(IsSorted(inputs)); diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index f238393ce923..37a8eeb44840 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -493,9 +493,8 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->keys.size(), 2U); ICHECK_EQ(target->keys[0], "cpu"); ICHECK_EQ(target->keys[1], "arm_cpu"); - ICHECK_EQ(target->attrs.size(), 2U); + ICHECK_EQ(target->attrs.size(), 1U); ICHECK_EQ(target->GetAttr("device"), "arm_cpu"); - ICHECK_EQ(target->GetAttr("link-params"), false); } TEST(TargetKindRegistry, ListTargetKinds) { @@ -511,5 +510,4 @@ TEST(TargetKindRegistry, ListTargetOptions) { 
ICHECK_EQ(attrs["mattr"], "Array"); ICHECK_EQ(attrs["mcpu"], "runtime.String"); - ICHECK_EQ(attrs["system-lib"], "IntImm"); } diff --git a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py index 9bbecdd7f81b..91b51cb5cc75 100644 --- a/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_softmax_slice.py @@ -14,7 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import pytest import numpy as np import tvm diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py index 4438ec437cb4..39e90e6d6ac4 100644 --- a/tests/python/driver/tvmc/test_target.py +++ b/tests/python/driver/tvmc/test_target.py @@ -114,9 +114,7 @@ def test_parse_multiple_target(): def test_parse_hybrid_target(): """Hybrid Target and external codegen""" - targets = parse_target( - "cmsis-nn -accelerator_config=ethos-u55-256, llvm -device=arm_cpu --system-lib" - ) + targets = parse_target("cmsis-nn -accelerator_config=ethos-u55-256, llvm -device=arm_cpu") assert len(targets) == 2 assert "cmsis-nn" == targets[0]["name"] @@ -154,7 +152,7 @@ def test_parse_quotes_and_separators_on_options(): def test_parse_multiple_target_with_opts_ethos_n78(): - targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu --system-lib") + targets = parse_target("ethos-n -myopt=value, llvm -device=arm_cpu") assert len(targets) == 2 assert "ethos-n" == targets[0]["name"] diff --git a/tests/python/driver/tvmc/test_target_options.py b/tests/python/driver/tvmc/test_target_options.py index c73dc288cdd8..891df86f0c1f 100644 --- a/tests/python/driver/tvmc/test_target_options.py +++ b/tests/python/driver/tvmc/test_target_options.py @@ -86,7 +86,7 @@ def test_skip_target_from_codegen(): def test_target_recombobulation_single(): tvm_target, _ = 
target_from_cli("llvm", {"llvm": {"mcpu": "cortex-m3"}}) - assert str(tvm_target) == "llvm -keys=arm_cpu,cpu -link-params=0 -mcpu=cortex-m3" + assert str(tvm_target) == "llvm -keys=arm_cpu,cpu -mcpu=cortex-m3" def test_target_recombobulation_many(): diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py index 3f641c995652..4ffe302763f8 100644 --- a/tests/python/relay/aot/test_cpp_aot.py +++ b/tests/python/relay/aot/test_cpp_aot.py @@ -169,7 +169,7 @@ def test_create_executor(): x = tvm.relay.var("x", tvm.relay.TensorType([1], dtype="float32")) expr = tvm.relay.add(x, tvm.relay.Constant(tvm.nd.array(np.array([1], dtype="float32")))) actual = relay.create_executor( - "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c -executor=aot" + "aot", mod=tvm.IRModule.from_expr(tvm.relay.Function([x], expr)), target="c" ).evaluate()(np.array([2], dtype="float32")) np.isfinite(np.array([3], dtype="float32")) diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index edf23ff22781..c3426f147e0d 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -710,42 +710,6 @@ def test_name_sanitiser_name_clash(): ) -# This tests for deprecated AOT executor arguments -# TODO(Mousius) Remove deprecated arguments later -def test_deprecated_target_arguments(): - """Tests we can still use relay.build with -executor, -runtime and -link-params""" - - interface_api = "c" - use_unpacked_api = True - test_runner = AOT_DEFAULT_RUNNER - - input_x = relay.var("x", shape=(1, 10)) - input_y = relay.var("y", shape=(1, 10)) - func_add = relay.add(input_x, input_y) - func = relay.Function([input_x, input_y], func_add) - - x_in = np.ones((1, 10)).astype("float32") - y_in = np.random.uniform(size=(1, 10)).astype("float32") - - params = {"x": x_in} - inputs = {"y": y_in} - output_list = generate_ref_data(func, inputs, params) - - compile_and_run( - AOTTestModel( - 
module=IRModule.from_expr(func), - inputs=inputs, - outputs=output_list, - params=params, - ), - test_runner, - interface_api, - use_unpacked_api, - use_runtime_executor=False, - target="c -executor=aot --link-params -runtime=c -interface-api=c --unpacked-api", - ) - - def test_aot_codegen_backend_alloc_workspace_calls(): """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls""" diff --git a/tests/python/relay/test_build_module.py b/tests/python/relay/test_build_module.py index d51cfd29dc97..5cfc27330aff 100644 --- a/tests/python/relay/test_build_module.py +++ b/tests/python/relay/test_build_module.py @@ -22,48 +22,23 @@ from tvm import relay from tvm.target.target import Target from tvm.relay.backend import Runtime, Executor, graph_executor_codegen -from tvm.relay.build_module import _reconstruct_from_deprecated_options @pytest.mark.parametrize( - "target,executor,runtime", + "test_target,unsupported_config", [ - [Target("c"), None, None], - [Target("c -runtime=c"), None, Runtime("crt")], - [Target("c -system-lib"), None, Runtime("cpp", {"system-lib": True})], - [Target("c -runtime=c -system-lib"), None, Runtime("crt", {"system-lib": True})], - [Target("c -executor=aot"), Executor("aot"), None], - [ - Target("c -executor=aot -interface-api=c"), - Executor("aot", {"interface-api": "c"}), - None, - ], - [ - Target("c -executor=aot -unpacked-api=1"), - Executor("aot", {"unpacked-api": 1}), - None, - ], - [Target("c -executor=aot -link-params=1"), Executor("aot"), None], - [Target("c -link-params=1"), Executor("graph", {"link-params": 1}), None], - [ - Target( - "c -executor=aot -link-params=1 -interface-api=c" - " -unpacked-api=1 -runtime=c -system-lib" - ), - Executor("aot", {"unpacked-api": 1, "interface-api": "c"}), - Runtime("crt", {"system-lib": True}), - ], + ["c", "-runtime=c"], + ["c", "-system-lib=1"], + ["c", "-executor=aot"], + ["c", "-interface-api=c"], + ["c", "-unpacked-api=1"], + ["c", "-link-params=1"], ], ) -def 
test_deprecated_target_parameters(target, executor, runtime): - actual_executor, actual_runtime = _reconstruct_from_deprecated_options(target) - - assert (executor is None and actual_executor is None) or (executor.name == actual_executor.name) - # sort as TVM Map cannot guarantee round-trip order. - assert (executor is None and actual_executor is None) or ( - sorted(executor.attrs.items()) == sorted(actual_executor.attrs.items()) - ) - assert runtime == actual_runtime +def test_deprecated_target_parameters(test_target, unsupported_config): + with pytest.raises(ValueError) as e_info: + Target(f"{test_target} {unsupported_config}") + assert f"Cannot recognize '{unsupported_config}" in str(e_info.value) def test_build_relay_graph_(): diff --git a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py index a546c16a648e..be229a580f01 100644 --- a/tests/python/unittest/test_tir_transform_common_subexpr_elim.py +++ b/tests/python/unittest/test_tir_transform_common_subexpr_elim.py @@ -449,7 +449,7 @@ def test_deterministic_cse(): # Needed for the second test on determinism LOG_LINE = '{"i": [["[\\"conv2d_layer\\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", \ - "llvm -keys=cpu -link-params=0 -mcpu=broadwell -num-cores=2", \ + "llvm -keys=cpu -mcpu=broadwell -num-cores=2", \ [8, 64, 64, 0, 0, 0, 0, 0], "", 1, []], [[], [["CI", 5], \ ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 512, [1, 32, 16], 1], \ ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 7, [1, 1, 1], 1], \ diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 0a2cec6011ef..e5f5ae752aac 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3087,9 +3087,7 @@ def func_with_target_spec_by_config() -> None: "kind": "cuda", "tag": "", "keys": ["cuda", "gpu"], - "host": T.target( - {"kind": "llvm", "tag": "", 
"keys": ["cpu"], "link-params": False} - ), + "host": T.target({"kind": "llvm", "tag": "", "keys": ["cpu"]}), } ) } diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 8b390c962e98..d8578fde2817 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -83,7 +83,7 @@ IGNORED_WARNINGS=( 'strategy:depthwise_conv2d NHWC layout is not optimized for x86 with autotvm.' 'strategy:depthwise_conv2d with layout NHWC is not optimized for arm cpu.' 'strategy:dense is not optimized for arm cpu.' - 'autotvm:Cannot find config for target=llvm -keys=cpu -link-params=0' + 'autotvm:Cannot find config for target=llvm -keys=cpu' 'autotvm:One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.' 'autotvm:Cannot find config for target=cuda -keys=cuda,gpu' # Warning is thrown during TFLite quantization for micro_train tutorial From 577826182ff5c0029348b66b8f977c29e21c4ad4 Mon Sep 17 00:00:00 2001 From: crawlingcub <86861129+crawlingcub@users.noreply.github.com> Date: Wed, 24 Aug 2022 02:08:47 -0500 Subject: [PATCH 021/704] [PyTorch][Fix] Fix for numerically unstable logsigmoid (#12563) * Fix numerical instability for log sigmoid Fix numerical instability for log sigmoid in pytorch frontend * update * add test for overflow check * merging two tests --- python/tvm/relay/frontend/pytorch.py | 4 +++- tests/python/frontend/pytorch/test_forward.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 8ed94c2a81c9..04a25c86b799 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -911,7 +911,9 @@ def glu(self, inputs, input_types): def log_sigmoid(self, inputs, input_types): data = inputs[0] - return _op.log(_op.tensor.sigmoid(data)) + mn = _op.minimum(_op.const(0, dtype=input_types[0]), data) + z = 
_op.exp(-_op.abs(data)) + return mn - self.log1p([z], input_types) def cross_entropy_loss_with_logits(self, inputs, input_types): input = inputs[0] diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index a030c5141a31..7e00770cd593 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -811,7 +811,9 @@ def test_forward_log_sigmoid(): torch.set_grad_enabled(False) input_shape = [10, 10] input_data = torch.rand(input_shape).float() + input_data_overflow = torch.tensor([-300.0, -100.0]).float() verify_model(torch.nn.LogSigmoid().eval(), input_data=input_data) + verify_model(torch.nn.LogSigmoid().eval(), input_data=input_data_overflow) @tvm.testing.uses_gpu From e468dc28eac3c78a3c70c2b1616c6345d4767eab Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 24 Aug 2022 08:10:59 +0100 Subject: [PATCH 022/704] [microNPU] Force compute_cycles_hint to be interpreted as an int64 value (#12558) `compute_cycles` can be the size of an int64 value, however it seems that when that value is attached to the IR as a pragma from Python, it is interpreted as an `int`, rather than `int64_t`. This commit adds an explicit cast to ensure the value is interpreted correctly. The reason these values started appearing very large and randomly is still yet to be solved, although the hope is that this fix will unblock CI. 
Change-Id: Idcdd7d37af1acd665590c87624446a025b50eb3d --- python/tvm/contrib/ethosu/cascader/scheduler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py index 2c804a3b3b64..5ebc95d7ef88 100644 --- a/python/tvm/contrib/ethosu/cascader/scheduler.py +++ b/python/tvm/contrib/ethosu/cascader/scheduler.py @@ -154,7 +154,11 @@ def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None: # Attach AttrStmt directly to npu op so it isn't removed by ReplaceOperators npu_op = part.subgraph.output_tensor.op.input_tensors[0].op.input_tensors[0] - sch[npu_op].pragma(npu_op.op.axis[0], "compute_cycles_hint", compute_cycles) + # Force the pragma to interpret the compute cycles as an int64 value + compute_cycles_int64_cast = tvm.tir.IntImm("int64", compute_cycles) + sch[npu_op].pragma( + npu_op.op.axis[0], "compute_cycles_hint", compute_cycles_int64_cast + ) output_tensor_config = plan.output_config output_tensor = output_tensor_config.tensor From 90b2f0d36996be10d71f0c923f588c6dfa0e8546 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Wed, 24 Aug 2022 08:13:30 +0100 Subject: [PATCH 023/704] [CI][CMSIS-NN] Running tests parallel using pytest-xdist (#12557) Introducing -n auto for CMSIS-NN tests to run them in parallel with pytest-xdist. This is needed because of additional parameterization done over cpu variants. 
Change-Id: I02e1b37ead0b0a562b5b1b2dacfeb3fdd7cc1ce3 --- tests/scripts/task_python_microtvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index e65f2253bb28..a2ef53a123bf 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -57,5 +57,5 @@ python3 gallery/how_to/work_with_microtvm/micro_aot.py run_pytest ctypes python-relay-strategy-arm_cpu tests/python/relay/strategy/arm_cpu --enable-corstone300-tests run_pytest ctypes python-integration-m7-simd tests/python/integration/test_arm_mprofile_dsp.py --enable-corstone300-tests -run_pytest ctypes python-integration-contrib-test_cmsisnn tests/python/contrib/test_cmsisnn +run_pytest ctypes python-integration-contrib-test_cmsisnn tests/python/contrib/test_cmsisnn -n auto run_pytest ctypes python-integration-contrib-test_ethosu tests/python/contrib/test_ethosu -n auto From 989e5a11285503716c2033f4e56f1bba6b6d00c7 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Wed, 24 Aug 2022 08:16:45 +0100 Subject: [PATCH 024/704] [ETHOSN] Add support for resize (#12535) This commit adds support for the `resize` operator for Arm(R) Ethos(TM)-N NPU. 
--- python/tvm/relay/op/contrib/ethosn.py | 15 ++ src/relay/backend/contrib/ethosn/codegen.cc | 39 +++++ .../backend/contrib/ethosn/codegen_ethosn.h | 1 + .../backend/contrib/ethosn/ethosn_api.cc | 40 ++++++ src/relay/backend/contrib/ethosn/ethosn_api.h | 8 ++ .../python/contrib/test_ethosn/test_resize.py | 134 ++++++++++++++++++ 6 files changed, 237 insertions(+) create mode 100644 tests/python/contrib/test_ethosn/test_resize.py diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index eb753ef1391f..469939ecf0b8 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -176,6 +176,13 @@ def qnn_requantize_pattern(): ) return pattern + def qnn_resize_pattern(): + pattern = is_op("image.resize2d")(wildcard()).has_attr({"method": "nearest_neighbor"}) + pattern = is_op("qnn.requantize")( + pattern, is_constant(), is_constant(), is_constant(), is_constant() + ) + return pattern + def check_conv2d(extract): """Check if a conv2d is supported by Ethos-N.""" if not ethosn_available(): @@ -232,6 +239,13 @@ def check_requantize(extract): return support.requantize(extract) + def check_resize(extract): + """Check if resize (nearest neighbor) is supported.""" + if not ethosn_available(): + return False + + return support.resize(extract) + return [ ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d), ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d), @@ -240,6 +254,7 @@ def check_requantize(extract): ("ethos-n.qnn_mean", qnn_mean_pattern(), check_mean), ("ethos-n.qnn_tanh", qnn_tanh_pattern(), check_tanh), ("ethos-n.qnn_leaky_relu", qnn_leaky_relu_pattern(), check_leaky_relu), + ("ethos-n.qnn_resize", qnn_resize_pattern(), check_resize), ("ethos-n.qnn_requantize", qnn_requantize_pattern(), check_requantize), ] diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index f5cce30e4521..bc4613b80155 100644 --- 
a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -148,6 +148,10 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) { RequantizeParams params; err += EthosnAPI::Requantize(cn->op.as()->body, &params); tensor_table_[cn->args[0]] = {params.input_info}; + } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) { + ResizeParams params; + err += EthosnAPI::Resize(cn->op.as()->body, &params); + tensor_table_[cn->args[0]] = {params.input_info}; } else { err = EthosnError("unknown operator"); } @@ -322,6 +326,9 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) { } else if (IsEthosnFunc(call, "ethos-n.qnn_requantize")) { if ((err = MakeRequantizeLayer(call, &tensor))) ReportFatalError(call, err); return MakeOps(tensor); + } else if (IsEthosnFunc(call, "ethos-n.qnn_resize")) { + if ((err = MakeResizeLayer(call, &tensor))) ReportFatalError(call, err); + return MakeOps(tensor); } else { ReportFatalError(call, EthosnError("unknown operator")); return {}; @@ -622,6 +629,24 @@ EthosnError ConstructNetworkVisitor::MakeRequantizeLayer(const Call& call, return EthosnError(); } +EthosnError ConstructNetworkVisitor::MakeResizeLayer(const Call& call, + sl::TensorAndId* out) { + ResizeParams params; + params.input_info = GetTensorInfo(tensor_table_, call); + if (auto err = EthosnAPI::Resize(call->op.as()->body, &params)) { + return err; + } + + auto input = operand_table_[call->args[0]][0]; + + try { + *out = AddResize(network_, *input, params.resize_info); + } catch (const sl::NotSupportedException& e) { + return EthosnError(e.what()); + } + return EthosnError(); +} + runtime::Module EthosnCompiler::CreateRuntimeModule(const ObjectRef& ref) { std::vector cmms; if (ref->IsInstance()) { @@ -958,6 +983,20 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.requantize") err += EthosnError(reason); }); +TVM_REGISTER_GLOBAL("relay.ethos-n.support.resize") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = 
args[0]; + ResizeParams params; + auto err = EthosnAPI::Resize(call, &params); + err += EthosnCompiler::SupportedSetup(); + char reason[kReasonMaxLength]; + reason[0] = '\0'; + *rv = !err && + EthosnCompiler::GetSupported()->IsResizeSupported( + params.resize_info, params.input_info, &params.output_info, reason, sizeof(reason)); + err += EthosnError(reason); + }); + TVM_REGISTER_GLOBAL("relay.ethos-n.query").set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { #if defined ETHOSN_HW *rv = true; diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index 66aefab16d2d..863a032cafba 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -212,6 +212,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP EthosnError MakeReluLayer(const Call& call, sl::TensorAndId* out); EthosnError MakeLeakyReLULayer(const Call& call, sl::TensorAndId* out); EthosnError MakeRequantizeLayer(const Call& call, sl::TensorAndId* out); + EthosnError MakeResizeLayer(const Call& call, sl::TensorAndId* out); /*! \brief A look-up table from Expr to layers. 
*/ std::map>> operand_table_; diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index c1f67d0d2b16..ccca1779f6d9 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -23,6 +23,7 @@ #include "ethosn_api.h" +#include #include #include #include @@ -684,6 +685,45 @@ EthosnError EthosnAPI::Requantize(const Expr& expr, RequantizeParams* params) { return err; } +EthosnError EthosnAPI::Resize(const Expr& expr, ResizeParams* params) { + Call requantize = Downcast(expr); + Call resize = Downcast(requantize->args[0]); + + const auto* input_dtype = resize->args[0]->checked_type().as(); + sl::TensorShape input_tensor_shape = {1, 1, 1, 1}; + EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape); + sl::DataType input_tensor_dtype; + err += Tvm2Npu(input_dtype->dtype, &input_tensor_dtype); + float input_sc; + int input_zp; + err += AsConstant(requantize->args[2], &input_zp); + err += AsConstant(requantize->args[1], &input_sc); + sl::QuantizationInfo input_q_info; + err += Tvm2Npu(input_zp, input_sc, &input_q_info); + params->input_info = + sl::TensorInfo(input_tensor_shape, input_tensor_dtype, sl::DataFormat::NHWC, input_q_info); + + float output_sc; + int output_zp; + err += AsConstant(requantize->args[3], &output_sc); + err += AsConstant(requantize->args[4], &output_zp); + sl::QuantizationInfo resize_q_info; + err += Tvm2Npu(output_zp, output_sc, &resize_q_info); + const auto* attrs = resize->attrs.as(); + uint32_t height, width; + err += Tvm2Npu(attrs->size, &height, &width); + params->resize_info = + sl::ResizeInfo{sl::ResizeAlgorithm::NEAREST_NEIGHBOUR, height, width, resize_q_info}; + + sl::TensorInfo output_info = params->input_info; + output_info.m_Dimensions[1] = params->resize_info.m_NewHeight; + output_info.m_Dimensions[2] = params->resize_info.m_NewWidth; + output_info.m_QuantizationInfo = params->resize_info.m_OutputQuantizationInfo; + 
params->output_info = output_info; + + return err; +} + EthosnError EthosnAPI::Tvm2Npu(const Array& padding, sl::Padding* npu_padding) { std::array dim; if (EthosnError err = AsArray(padding, &dim)) { diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h index bb1cd29a5bc4..afe4736bfc40 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api.h @@ -146,6 +146,12 @@ struct RequantizeParams { sl::TensorInfo output_info; }; +struct ResizeParams { + sl::ResizeInfo resize_info; + sl::TensorInfo input_info; + sl::TensorInfo output_info; +}; + /*! * \brief A wrapper around std::stringstream to build an EthosnError. */ @@ -241,6 +247,8 @@ class EthosnAPI { static EthosnError Relu(const Expr& expr, ReluParams* params); /*! \brief Extract the Support Library requantize params from a Relay qnn.requantize call */ static EthosnError Requantize(const Expr& expr, RequantizeParams* params); + /*! \brief Extract the Support Library resize params from a Relay resize call */ + static EthosnError Resize(const Expr& expr, ResizeParams* params); private: /*! \brief Convert a TVM IndexExpr array to a SL tensor shape */ diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py new file mode 100644 index 000000000000..b9d807d21926 --- /dev/null +++ b/tests/python/contrib/test_ethosn/test_resize.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Arm(R) Ethos(TM)-N integration resize tests""" + +import pytest +import numpy as np +import tvm +from tvm import relay +from tvm.testing import requires_ethosn +from . import infrastructure as tei + + +def _get_model( + shape, + dtype, + size, + input_zp, + input_sc, + output_zp, + output_sc, + coordinate_transformation_mode, + rounding_method, +): + x = relay.var("x", shape=shape, dtype=dtype) + resize = relay.image.resize2d( + data=x, + size=size, + layout="NHWC", + method="nearest_neighbor", + coordinate_transformation_mode=coordinate_transformation_mode, + rounding_method=rounding_method, + ) + model = relay.qnn.op.requantize( + resize, + input_scale=relay.const(input_sc, "float32"), + input_zero_point=relay.const(input_zp, "int32"), + output_scale=relay.const(output_sc, "float32"), + output_zero_point=relay.const(output_zp, "int32"), + out_dtype=dtype, + ) + return model + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "shape, size, coordinate_transformation_mode, rounding_method", + [ + ((1, 4, 4, 2), (8, 8), "half_pixel", "round_prefer_ceil"), + ((1, 4, 4, 2), (7, 7), "asymmetric", "floor"), + ((1, 4, 8, 3), (8, 16), "half_pixel", "round_prefer_ceil"), + ((1, 4, 8, 3), (7, 15), "asymmetric", "floor"), + ], +) +def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_method): + np.random.seed(0) + zp_min = np.iinfo(dtype).min + zp_max = np.iinfo(dtype).max + inputs = { + "x": tvm.nd.array(np.random.randint(zp_min, high=zp_max + 1, size=shape, dtype=dtype)), + } + outputs 
= [] + for npu in [False, True]: + model = _get_model( + shape=shape, + dtype=dtype, + size=size, + input_zp=zp_min + 128, + input_sc=0.0784314, + output_zp=zp_min + 128, + output_sc=0.0784314, + coordinate_transformation_mode=coordinate_transformation_mode, + rounding_method=rounding_method, + ) + mod = tei.make_module(model, {}) + x = tei.build_and_run(mod, inputs, 1, {}, npu=npu) + outputs.append(x) + + tei.verify(outputs, dtype, 1) + + +@requires_ethosn +def test_resize_failure(): + trials = [ + ( + (30, 20), + "Requested height isn't supported", + ), + ( + (20, 30), + "Requested width isn't supported", + ), + ( + (19, 20), + "Requested width and height must be both even or both odd", + ), + ( + (20, 19), + "Requested width and height must be both even or both odd", + ), + ] + dtype = "int8" + zp_min = np.iinfo(dtype).min + + for size, err_msg in trials: + model = _get_model( + shape=(1, 10, 10, 1), + dtype=dtype, + size=size, + input_zp=zp_min + 128, + input_sc=0.0784314, + output_zp=zp_min + 128, + output_sc=0.0784314, + coordinate_transformation_mode="half_pixel", + rounding_method="round_prefer_ceil", + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) From 1ec2c369128c9d57bb09087ab16cb3a2527dd9de Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 24 Aug 2022 17:44:22 +0800 Subject: [PATCH 025/704] [TIR][CompactBufferAllocation] Improve upperbound estimation of buffer compaction (#12527) Hi, this change makes some minor updates to the region estimator used by buffer compaction: - Add and clarify the distinction among `EstimateRegionStrictBound`, `EstimateRegionLowerBound` and `EstimateRegionUpperBound`. Originally we only had `EstimateRegionLowerBound`, which IMO actually implements strict bound estimation. Now add `upper` and `strict` versions for the places where we actually want them. - When estimating upper bounds (e.g. 
in buffer compaction), try to estimate each dimension independently when the accesses are dependent and `EstimateRegionLowerBound` is expected to fail. E.g., `A[i, i], 3 < i < 16` fails via `EstimateRegionLowerBound`, which checks that the indices are independent. But we can still do our best to invoke strict bound analysis on each dimension individually. - If range->extent == 1 for `EvalSet(range, dom)`, invoke `EvalSet(range->min, dom)` instead. E.g., `EvalSet([k*k, k*k+1), dom_k)` results in [-inf, +inf] due to a limitation of the current algorithm, but `EvalSet(k*k, dom_k)` results in a range, which makes more sense. --- include/tvm/arith/int_set.h | 39 +- python/tvm/arith/__init__.py | 8 +- python/tvm/arith/int_set.py | 48 +++ src/arith/int_set.cc | 131 +++++-- src/tir/schedule/primitive/compute_at.cc | 2 +- src/tir/schedule/state.cc | 14 +- src/tir/schedule/utils.h | 18 - src/tir/transforms/compact_buffer_region.cc | 2 +- tests/python/unittest/test_arith_intset.py | 354 ++++++++++-------- ...est_tir_transform_compact_buffer_region.py | 100 +++++ 10 files changed, 496 insertions(+), 220 deletions(-) diff --git a/include/tvm/arith/int_set.h b/include/tvm/arith/int_set.h index 7cc4efe6b012..5ef7108d9797 100644 --- a/include/tvm/arith/int_set.h +++ b/include/tvm/arith/int_set.h @@ -261,7 +261,29 @@ Array UnionRegionLowerBound(const Array>& nd_int_sets); IntSet Intersect(const Array& sets); /*! - * \brief Analyze the region with affine map, given the domain of variables and their predicate + * \brief Converts the Ranges to IntSets + * \param var_dom The ranges of variables + * \return The integer sets of the variables + */ +Map AsIntSet(const Map& var_dom); + +/*! + * \brief Analyze the region with affine map, given the domain of variables and their predicate. + * The result should be strict, i.e. no region is discarded or relaxed. 
+ * \param region The region to be analyzed + * \param var_dom The ranges of the variables + * \param predicate The predicate for the affine map + * \param analyzer The analyzer used + * \return NullOpt if the detection fails, or an array of arith::IntSet as the result of analysis + */ +TVM_DLL Optional> EstimateRegionStrictBound(const Array& region, + const Map& var_dom, + const PrimExpr& predicate, + arith::Analyzer* analyzer); + +/*! + * \brief Analyze the region with affine map, given the domain of variables and their predicate. + * Some subregion may be discarded during the lower-bound analysis. * \param region The region to be analyzed * \param var_dom The ranges of the variables * \param predicate The predicate for the affine map @@ -273,6 +295,21 @@ TVM_DLL Optional> EstimateRegionLowerBound(const Array& reg const PrimExpr& predicate, arith::Analyzer* analyzer); +/*! + * \brief Analyze the region with affine map, given the domain of variables and their predicate + * Relaxation of the region may be used in upper-bound analysis, i.e. some extra region may be added + * to the result. + * \param region The region to be analyzed + * \param var_dom The ranges of the variables + * \param predicate The predicate for the affine map + * \param analyzer The analyzer used + * \return an array of arith::IntSet as the result of analysis + */ +TVM_DLL Array EstimateRegionUpperBound(const Array& region, + const Map& var_dom, + const PrimExpr& predicate, + arith::Analyzer* analyzer); + } // namespace arith } // namespace tvm #endif // TVM_ARITH_INT_SET_H_ diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py index f5a0478dc008..03c0769850c9 100644 --- a/python/tvm/arith/__init__.py +++ b/python/tvm/arith/__init__.py @@ -16,7 +16,13 @@ # under the License. 
"""Integer bound analysis, simplification and pattern detection.""" -from .int_set import IntSet, IntervalSet, estimate_region_lower_bound +from .int_set import ( + IntSet, + IntervalSet, + estimate_region_lower_bound, + estimate_region_strict_bound, + estimate_region_upper_bound, +) from .analyzer import ModularSet, ConstIntBound, Analyzer from .bound import deduce_bound from .pattern import detect_linear_equation, detect_clip_bound diff --git a/python/tvm/arith/int_set.py b/python/tvm/arith/int_set.py index b5f2100b7c7d..151461bcaf9f 100644 --- a/python/tvm/arith/int_set.py +++ b/python/tvm/arith/int_set.py @@ -83,6 +83,7 @@ def __init__(self, min_value, max_value): def estimate_region_lower_bound(region, var_dom, predicate): """Analyze the region with affine map, given the domain of variables and their predicate + Some subregion may be discarded during the lower-bound analysis. Parameters ---------- @@ -103,6 +104,53 @@ def estimate_region_lower_bound(region, var_dom, predicate): return _ffi_api.EstimateRegionLowerBound(region, var_dom, predicate) +def estimate_region_strict_bound(region, var_dom, predicate): + """Analyze the region with affine map, given the domain of variables and their predicate + The result should be strict, i.e. no region is discarded or relaxed. + + Parameters + ---------- + region : List[Range] + The region to be analyzed. + + var_dom : Dict[Var, Range] + The ranges of the variables + + predicate : PrimExpr + The predicate for the affine map + + Returns + ---------- + region_int_set : Optional[List[IntSet]] + None if the detection fails, or an array of IntSets as the result of analysis + """ + return _ffi_api.EstimateRegionStrictBound(region, var_dom, predicate) + + +def estimate_region_upper_bound(region, var_dom, predicate): + """Analyze the region with affine map, given the domain of variables and their predicate + Relaxation of the region may be used in upper-bound analysis, + i.e. some extra region may be added to the result. 
+ + Parameters + ---------- + region : List[Range] + The region to be analyzed. + + var_dom : Dict[Var, Range] + The ranges of the variables + + predicate : PrimExpr + The predicate for the affine map + + Returns + ---------- + region_int_set : List[IntSet] + an array of IntSets as the result of analysis + """ + return _ffi_api.EstimateRegionUpperBound(region, var_dom, predicate) + + def pos_inf(): """Returns the symbolic positive infinity diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index 584bbe8f04ea..e8e223ceca09 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -975,6 +975,9 @@ IntSet EvalSet(PrimExpr e, const std::unordered_map& dom IntSet EvalSet(Range r, const Map& dom_map) { Analyzer ana; + if ((r->min->dtype.is_int() || r->min->dtype.is_uint()) && ana.CanProveEqual(r->extent, 1)) { + return EvalSet(r->min, dom_map); + } IntervalSetEvaluator m(&ana, dom_map); // Simplifying first can give tighter bounds if r->min and r->extent share variables PrimExpr sum = r->min + r->extent - 1; @@ -1035,15 +1038,57 @@ IntSet EvalSet(Range r, const Map& dom_map) { return EvalSet(r, ConvertDomMap(dom_map)); } -Optional> EstimateRegionLowerBound(const Array& region, - const Map& var_dom, - const PrimExpr& predicate, Analyzer* analyzer) { +Map AsIntSet(const Map& var_dom) { + Map result; + for (auto kv : var_dom) { + const Var& var = kv.first; + const Range& range = kv.second; + result.Set(var, arith::IntSet::FromRange(range)); + } + return result; +} + +/*! \brief Helper function to convert IterSumExpr to the actual touched range. 
*/ +static Optional EvalIterSum(const IterSumExpr& iter_min, const PrimExpr& extent, + Analyzer* analyzer) { + if (iter_min->args.empty()) { + return IntSet::FromMinExtent(iter_min->base, extent); + } + ICHECK_EQ(iter_min->args.size(), 1) << "The `EvalIterSum` expects fused iter sum expr"; + const IterSplitExpr& split = iter_min->args[0]; + if (!analyzer->CanProve(extent >= split->scale)) { + return NullOpt; + } + + const PrimExpr& base = iter_min->base; + // IterSplitExpr: (source // lower_factor) % extent * scale + // where `(source // lower_factor) % extent` is within [0, extent - 1] + if (analyzer->CanProve(split->scale < 0)) { + // If scale is negative, the var dom is [(extent - 1) * scale, 0] + // The total base is `base + (extent - 1) * scale`, + // while total extent is `dom_extent + (extent - 1) * (-scale)` + const PrimExpr& var_extent = (split->extent - 1) * split->scale; + return IntSet::FromMinExtent(base + var_extent, extent - var_extent); + } else { + // If scale is positive, the var dom is [0, (extent - 1) * scale] + // The total dom is [base, dom_extent + (extent - 1) * scale] + return IntSet::FromMinExtent(base, extent + (split->extent - 1) * split->scale); + } +} + +Optional> EstimateRegionStrictBound(const Array& region, + const Map& var_dom, + const PrimExpr& predicate, Analyzer* analyzer) { int ndim = region.size(); Array iter_sum_exprs{nullptr}; { Array affine_indices; affine_indices.reserve(ndim); for (const Range& range : region) { + if (!is_const_number(range->extent)) { + // dynamic extent is not supported yet. 
+ return NullOpt; + } affine_indices.push_back(range->min); } auto res = DetectIterMap( @@ -1060,31 +1105,57 @@ Optional> EstimateRegionLowerBound(const Array& region, for (int i = 0; i < ndim; ++i) { const IterSumExpr& sum_expr = iter_sum_exprs[i]; const Range& range = region[i]; - if (sum_expr->args.empty()) { - result.push_back(IntSet::FromMinExtent(sum_expr->base, range->extent)); - continue; - } - ICHECK_EQ(sum_expr->args.size(), 1); - const IterSplitExpr& split = sum_expr->args[0]; - if (!analyzer->CanProve(range->extent >= split->scale)) { + Optional int_set = EvalIterSum(sum_expr, range->extent, analyzer); + if (int_set.defined()) { + result.push_back(int_set.value()); + } else { return NullOpt; } + } + return result; +} - const PrimExpr& base = sum_expr->base; - // IterSplitExpr: (source // lower_factor) % extent * scale - // where `(source // lower_factor) % extent` is within [0, extent - 1] - if (analyzer->CanProve(split->scale < 0)) { - // If scale is negative, the var dom is [(extent - 1) * scale, 0] - // The total base is `base + (extent - 1) * scale`, - // while total extent is `dom_extent + (extent - 1) * (-scale)` - const PrimExpr& var_extent = (split->extent - 1) * split->scale; - result.push_back(IntSet::FromMinExtent(base + var_extent, range->extent - var_extent)); - } else { - // If scale is positive, the var dom is [0, (extent - 1) * scale] - // The total dom is [base, dom_extent + (extent - 1) * scale] - result.push_back( - IntSet::FromMinExtent(base, range->extent + (split->extent - 1) * split->scale)); +Optional> EstimateRegionLowerBound(const Array& region, + const Map& var_dom, + const PrimExpr& predicate, + arith::Analyzer* analyzer) { + return EstimateRegionStrictBound(region, var_dom, predicate, analyzer); +} + +Array EstimateRegionUpperBound(const Array& region, const Map& var_dom, + const PrimExpr& predicate, Analyzer* analyzer) { + if (Optional> result = EstimateRegionStrictBound( + /*region=*/region, + /*var_dom=*/var_dom, + 
/*predicate=*/predicate, /*analyzer=*/analyzer)) { + return result.value(); + } + Array result; + result.reserve(region.size()); + // try estimate each dimension independently + for (const Range& range : region) { + auto res = DetectIterMap( + /*indices=*/{range->min}, /*input_iters=*/var_dom, + /*predicate=*/predicate, /*check_level=*/IterMapLevel::Surjective, analyzer); + if (!res->indices.empty()) { + ICHECK_EQ(res->indices.size(), 1U); + IterSumExpr sum_expr = res->indices[0]; + + // dynamic extent is not supported yet. + PrimExpr extent = range->extent; + if (!is_const_number(extent)) { + IntSet relaxed = EvalSet(extent, AsIntSet(var_dom)); + ICHECK(relaxed.HasUpperBound()); + extent = relaxed.max(); + } + + if (Optional int_set = EvalIterSum(sum_expr, range->extent, analyzer)) { + result.push_back(int_set.value()); + continue; + } } + // fallback to coarse grained evalset + result.push_back(EvalSet(range, AsIntSet(var_dom))); } return result; } @@ -1118,6 +1189,18 @@ TVM_REGISTER_GLOBAL("arith.EstimateRegionLowerBound") Analyzer analyzer; return EstimateRegionLowerBound(region, var_dom, predicate, &analyzer); }); +TVM_REGISTER_GLOBAL("arith.EstimateRegionStrictBound") + .set_body_typed([](Array region, Map var_dom, + PrimExpr predicate) -> Optional> { + Analyzer analyzer; + return EstimateRegionStrictBound(region, var_dom, predicate, &analyzer); + }); +TVM_REGISTER_GLOBAL("arith.EstimateRegionUpperBound") + .set_body_typed([](Array region, Map var_dom, + PrimExpr predicate) -> Optional> { + Analyzer analyzer; + return EstimateRegionUpperBound(region, var_dom, predicate, &analyzer); + }); TVM_REGISTER_GLOBAL("arith.PosInf").set_body_typed([]() { return SymbolicLimits::pos_inf_; }); TVM_REGISTER_GLOBAL("arith.NegInf").set_body_typed([]() { return SymbolicLimits::neg_inf_; }); diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 7b0d749f03dc..98a6b2400ee3 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ 
b/src/tir/schedule/primitive/compute_at.cc @@ -356,7 +356,7 @@ void RelaxBufferRegions(const Map& binding, runtime::StorageRank rank = scope.rank; if (rank != previous_rank || !var_dom.defined()) { previous_rank = rank; - var_dom = AsIntSet(LoopDomainOfSRefTreePath( + var_dom = arith::AsIntSet(LoopDomainOfSRefTreePath( /*low_inclusive=*/relax_path_low_inclusive, /*high_exclusive=*/relax_path_high_exclusive, /*extra_relax_scope=*/scope)); diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc index dadabba48540..07481ddb19e3 100644 --- a/src/tir/schedule/state.cc +++ b/src/tir/schedule/state.cc @@ -16,8 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -#include "./utils.h" +#include +#include "./utils.h" namespace tvm { namespace tir { @@ -44,13 +45,10 @@ Array AnalyzeRegionUpperBound(const BufferRegion& region, /*low_inclusive=*/dom_low_inclusive, /*high_exclusive=*/dom_high_exclusive, /*extra_relax_scope=*/runtime::StorageScope::Create(region->buffer.scope())); - if (Optional> result = EstimateRegionLowerBound( - /*region=*/region->region, - /*var_dom=*/var_dom, - /*predicate=*/predicate, /*analyzer=*/analyzer)) { - return result.value(); - } - return arith::EvalSet(region->region, AsIntSet(var_dom)); + return EstimateRegionUpperBound( + /*region=*/region->region, + /*var_dom=*/var_dom, + /*predicate=*/predicate, /*analyzer=*/analyzer); } /*! diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h index 53cafa798b54..3db80989ae10 100644 --- a/src/tir/schedule/utils.h +++ b/src/tir/schedule/utils.h @@ -249,24 +249,6 @@ inline bool IsThreadIdx(const runtime::ThreadScope& thread_scope) { return thread_scope.rank == 1 && thread_scope.dim_index >= 0; } -/******** Integer set ********/ - -/*! 
- * \brief Converts the Ranges to IntSets - * \param var_dom The ranges of variables - * \return The integer sets of the variables - */ -inline Map AsIntSet(const Map& var_dom) { - std::unordered_map result; - result.reserve(var_dom.size()); - for (auto kv : var_dom) { - Var& var = kv.first; - Range& range = kv.second; - result.emplace(std::move(var), arith::IntSet::FromRange(std::move(range))); - } - return {result.begin(), result.end()}; -} - /**************** Loop extents ****************/ /*! diff --git a/src/tir/transforms/compact_buffer_region.cc b/src/tir/transforms/compact_buffer_region.cc index 2844f1b35e9e..249b8cca77b0 100644 --- a/src/tir/transforms/compact_buffer_region.cc +++ b/src/tir/transforms/compact_buffer_region.cc @@ -88,7 +88,7 @@ NDIntSet NDIntSetEval(Region region, PrimExpr predicate, var_dom[GetRef(it.first)] = it.second.CoverRange(Range::FromMinExtent(0, 0)); } Optional> eval_res = - arith::EstimateRegionLowerBound(region, var_dom, predicate, analyzer); + arith::EstimateRegionUpperBound(region, var_dom, predicate, analyzer); if (eval_res.defined()) { return NDIntSet(eval_res.value().begin(), eval_res.value().end()); } diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py index 2302d0ed54f2..24228fb52703 100644 --- a/tests/python/unittest/test_arith_intset.py +++ b/tests/python/unittest/test_arith_intset.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. import tvm +import tvm.testing from tvm import te from tvm import tir -from tvm.ir.base import structural_equal +from tvm.arith.analyzer import Analyzer class IntSetChecker: @@ -128,66 +129,139 @@ def test_select(): ck.verify(tvm.tir.Select(x > 0, x - 1, x + 1), {x: tvm.arith.IntervalSet(0, 10)}, (-1, 11)) -def test_region_lower_bound_not_independent(): +def check_region_bound(expect_region, var_dom, mode, predicate=None): + """Helper to check region bound estimation. 
+ + Parameters + ---------- + expect_region: dict + The keys are of form (begin, end) or PrimExpr as a single point. The values are + expected estimated region or region dict on different bindings. + + var_dom: dict + Map var to iteration domain range. + + mode: str + Specify "lowerbound", "upperbound" or else use strict bound estimation. + + predicate: PrimExpr + Extra predicate, defaults to True. + """ + if predicate is None: + predicate = tvm.tir.IntImm("bool", 1) + region = [] + expect = [] + for k, v in expect_region.items(): + if not isinstance(k, (tuple, list)): + k = (k, k + 1) + region.append(tvm.ir.Range.from_min_extent(k[0], Analyzer().simplify(k[1] - k[0]))) + expect.append(v) + if mode == "lowerbound": + result = tvm.arith.estimate_region_lower_bound( + region=region, var_dom=var_dom, predicate=predicate + ) + elif mode == "upperbound": + result = tvm.arith.estimate_region_upper_bound( + region=region, var_dom=var_dom, predicate=predicate + ) + else: + result = tvm.arith.estimate_region_strict_bound( + region=region, var_dom=var_dom, predicate=predicate + ) + if result is None: + assert all([_ is None for _ in expect]) + return + assert len(result) == len(expect) + for intset, expect_desc in zip(result, expect): + if isinstance(expect_desc, dict): + # check range on different free var bindings + for binding in expect_desc: + analyzer = Analyzer() + for k, v in binding: + analyzer.bind(k, v) + expect_begin, expect_end = expect_desc[binding] + result_begin = analyzer.simplify(intset.min_value, 3) + result_end = analyzer.simplify(intset.max_value + 1, 3) + print(result_end) + assert analyzer.can_prove_equal( + result_begin - expect_begin, 0 + ), f"{result_begin} vs {expect_begin}" + assert analyzer.can_prove_equal( + result_end - expect_end, 0 + ), f"{result_end} vs {expect_end}" + else: + # check range + expect_begin, expect_end = expect_desc + analyzer = Analyzer() + assert analyzer.can_prove_equal( + intset.min_value - expect_begin, 0 + ), 
f"{intset.min_value} vs {expect_begin}" + assert analyzer.can_prove_equal( + intset.max_value - expect_end + 1, 0 + ), f"{intset.max_value} vs {expect_end - 1}" + + +def test_region_bound_not_independent(): + # (i, i+2) and (i+2, i+4) are dependent, this the lowerbound is not available i = tvm.tir.Var("i", "int32") - result = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range(begin=i, end=i + 2), - tvm.ir.Range(begin=i + 1, end=i + 4), - ], - var_dom={ - i: tvm.ir.Range(begin=0, end=64), - }, - predicate=tvm.tir.IntImm("bool", 1), + var_dom = { + i: tvm.ir.Range(begin=0, end=64), + } + check_region_bound({(i, i + 2): None, (i + 2, i + 4): None}, var_dom, mode="lowerbound") + check_region_bound({(i, i + 2): (0, 65), (i + 2, i + 4): (2, 67)}, var_dom, mode="upperbound") + + # when only a subset of access indices are affine + i, j, k = tvm.tir.Var("i", "int32"), tvm.tir.Var("j", "int32"), tvm.tir.Var("k", "int32") + var_dom = { + i: tvm.ir.Range(begin=0, end=16), + j: tvm.ir.Range(begin=0, end=16), + k: tvm.ir.Range(begin=0, end=16), + } + check_region_bound( + {i // 4: None, j * 4 + i % 4: None, tir.truncdiv(k, 2): None}, + var_dom, + predicate=j * 4 + i % 4 > 3, + mode="lowerbound", + ) + check_region_bound( + {i // 4: (0, 4), j * 4 + i % 4: (4, 64), tir.truncdiv(k, 2): (0, 8)}, + var_dom, + predicate=j * 4 + i % 4 > 3, + mode="upperbound", ) - assert result is None -def test_region_lower_bound_stride_too_wide(): +def test_region_bound_stride_too_wide(): i = tvm.tir.Var("i", "int32") - result = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range(begin=i * 4, end=i * 4 + 2), - ], - var_dom={ - i: tvm.ir.Range(begin=0, end=64), - }, - predicate=tvm.tir.IntImm("bool", 1), - ) - assert result is None + var_dom = {i: tvm.ir.Range(begin=0, end=64)} + check_region_bound({(i * 4, i * 4 + 2): None}, var_dom, mode="lowerbound") + check_region_bound({(i * 4, i * 4 + 2): (0, 254)}, var_dom, mode="upperbound") -def 
test_region_lower_bound_small_stride(): +def test_region_bound_small_stride(): i = tvm.tir.Var("i", "int32") - (result,) = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range.from_min_extent(min_value=i * 4, extent=8), - ], - var_dom={ - i: tvm.ir.Range(begin=0, end=64), - }, - predicate=tvm.tir.IntImm("bool", 1), - ) - assert result.min_value.value == 0 - assert result.max_value.value == 259 + var_dom = { + i: tvm.ir.Range(begin=0, end=64), + } + check_region_bound({(i * 4, i * 4 + 8): (0, 260)}, var_dom, mode="lowerbound") def test_region_lower_bound_split_predicate(): x_o = tvm.tir.Var("xo", "int32") x_i = tvm.tir.Var("xi", "int32") x = x_o * 4 + x_i - (result,) = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range.from_min_extent(min_value=x * 4, extent=8), - ], - var_dom={ - x_o: tvm.ir.Range(begin=0, end=16), - x_i: tvm.ir.Range(begin=0, end=4), - }, + var_dom = { + x_o: tvm.ir.Range(begin=0, end=16), + x_i: tvm.ir.Range(begin=0, end=4), + } + check_region_bound({(x * 4, x * 4 + 8): (0, 256)}, var_dom, predicate=x < 63, mode="lowerbound") + + check_region_bound( + {(x * 4, x * 4 + 8): (0, 256), (x * 3, x * 3 + 5): (0, 191)}, + var_dom, predicate=x < 63, + mode="upperbound", ) - assert result.min_value.value == 0 - assert result.max_value.value == 255 def test_region_lower_bound_multiple_variables(): @@ -198,127 +272,94 @@ def test_region_lower_bound_multiple_variables(): i = div(x, 16) j = div(mod(x, 16), 4) * 8 + mod(x, 4) + div(wid, 32) * 4 k = wid % 32 - (i_int_set, j_int_set, k_int_set) = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range.from_min_extent(min_value=i, extent=1), - tvm.ir.Range.from_min_extent(min_value=j, extent=1), - tvm.ir.Range.from_min_extent(min_value=k, extent=1), - ], - var_dom={ - x: tvm.ir.Range(begin=0, end=32), - wid: tvm.ir.Range(begin=0, end=64), - }, - predicate=tvm.tir.IntImm("bool", 1), - ) - assert i_int_set.min_value.value == 0 - assert i_int_set.max_value.value == 1 - assert 
j_int_set.min_value.value == 0 - assert j_int_set.max_value.value == 31 - assert k_int_set.min_value.value == 0 - assert k_int_set.max_value.value == 31 + var_dom = { + x: tvm.ir.Range(begin=0, end=32), + wid: tvm.ir.Range(begin=0, end=64), + } + check_region_bound({i: (0, 2), j: (0, 32), k: (0, 32)}, var_dom, mode="lowerbound") def test_region_lower_bound_negative_scale(): i = tvm.tir.Var("i", "int32") j = tvm.tir.Var("j", "int32") - int_set_0, int_set_1 = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range.from_min_extent(min_value=1 - i, extent=4), - tvm.ir.Range.from_min_extent(min_value=20 - j * 4, extent=16), - ], - var_dom={ - i: tvm.ir.Range(begin=0, end=4), - j: tvm.ir.Range(begin=0, end=4), - }, - predicate=tvm.tir.IntImm("bool", 1), + var_dom = { + i: tvm.ir.Range(begin=0, end=4), + j: tvm.ir.Range(begin=0, end=4), + } + check_region_bound( + {(1 - i, 5 - i): (-2, 5), (20 - j * 4, 36 - j * 4): (8, 36)}, var_dom, mode="lowerbound" ) - assert int_set_0.min_value.value == -2 - assert int_set_0.max_value.value == 4 - assert int_set_1.min_value.value == 8 - assert int_set_1.max_value.value == 35 def test_region_lower_bound_for_non_perfect_tile(): h1 = tvm.tir.Var("h1", "int32") h2 = tvm.tir.Var("h2", "int32") h3 = tvm.tir.Var("h3", "int32") - analyzer = tvm.arith.Analyzer() - - def do_test_point_access(point, predicates, var_dom, expect): - regions = tvm.arith.estimate_region_lower_bound( - region=[ - tvm.ir.Range.from_min_extent(min_value=point, extent=1), - ], - var_dom=var_dom, - predicate=tvm.tir.all(*predicates), - ) - if expect is None: # expect a failure - assert regions is None - else: - assert len(regions) == 1 - for binding, expect_min, expect_max in expect: - min_diff = expect_min - regions[0].min_value - assert analyzer.simplify(tir.stmt_functor.substitute(min_diff, binding), 3) == 0 - max_diff = expect_max - regions[0].max_value - assert analyzer.simplify(tir.stmt_functor.substitute(max_diff, binding), 3) == 0 # non-uniform tiling, 
single inner variable - # h3 == 0: region is [1, 9] - # 0 < h3 <= 26: region is [h3 * 8, h3 * 8 + 9] - # h3 > 26: region is [h3 * 8, 223] - do_test_point_access( - point=h3 * 8 + h2, - predicates=[1 <= h3 * 8 + h2, h3 * 8 + h2 < 224], - var_dom={ - h2: tvm.ir.Range(begin=0, end=10), + var_dom = { + h2: tvm.ir.Range(begin=0, end=10), + } + check_region_bound( + { + h3 * 8 + + h2: { + (): ( + tvm.tir.max(h3 * 8, 1), + tvm.tir.max(h3 * 8, 1) + - tvm.tir.max(h3 * 8, 214) + - tvm.tir.max(1 - h3 * 8, 0) + + 224, + ), + ((h3, 0),): (1, 10), # h3 == 0: region is [1, 10) + ((h3, 10),): (h3 * 8, h3 * 8 + 10), # 0 < h3 <= 26: region is [h3 * 8, h3 * 8 + 10) + ((h3, 27),): (h3 * 8, 224), # h3 > 26: region is [h3 * 8, 224) + } }, - expect=[ - ( - {}, - tvm.tir.max(h3 * 8, 1), - tvm.tir.max(h3 * 8, 1) - - tvm.tir.max(h3 * 8, 214) - - tvm.tir.max(1 - h3 * 8, 0) - + 223, - ), - ({h3: 0}, 1, 9), - ({h3: 10}, h3 * 8, h3 * 8 + 9), - ({h3: 27}, h3 * 8, 223), - ], + var_dom, + predicate=tvm.tir.all(1 <= h3 * 8 + h2, h3 * 8 + h2 < 224), + mode="lowerbound", ) # non-uniform tiling, two inner variables - do_test_point_access( - point=h3 * 8 + h2 * 5 + h1, - predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h2 * 5 + h1 < 224], - var_dom={ - h2: tvm.ir.Range(begin=0, end=2), - h1: tvm.ir.Range(begin=0, end=5), + var_dom = { + h1: tvm.ir.Range(begin=0, end=5), + h2: tvm.ir.Range(begin=0, end=2), + } + check_region_bound( + { + h3 * 8 + + h2 * 5 + + h1: { + (): ( + tvm.tir.max(h3 * 8, 1), + tvm.tir.max(h3 * 8, 1) + - tvm.tir.max(h3 * 8, 214) + - tvm.tir.max(1 - h3 * 8, 0) + + 224, + ), + ((h3, 0),): (1, 10), + ((h3, 10),): (h3 * 8, h3 * 8 + 10), + ((h3, 27),): (h3 * 8, 224), + } }, - expect=[ - ( - {}, - tvm.tir.max(h3 * 8, 1), - tvm.tir.max(h3 * 8, 1) - - tvm.tir.max(h3 * 8, 214) - - tvm.tir.max(1 - h3 * 8, 0) - + 223, - ), - ({h3: 0}, 1, 9), - ({h3: 10}, h3 * 8, h3 * 8 + 9), - ({h3: 27}, h3 * 8, 223), - ], + var_dom, + predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h2 * 5 + h1 < 
224), + mode="lowerbound", ) - # should fail on incompatible predicates - do_test_point_access( - point=h3 * 8 + h2 * 5 + h1, - predicates=[1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224], - var_dom={ - h2: tvm.ir.Range(begin=0, end=2), - h1: tvm.ir.Range(begin=0, end=5), - }, - expect=None, + # lowerbound should fail on incompatible predicates + check_region_bound( + {h3 * 8 + h2 * 5 + h1: None}, + var_dom, + predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224), + mode="lowerbound", + ) + check_region_bound( + {h3 * 8 + h2 * 5 + h1: (h3 * 8, h3 * 8 + 10)}, + var_dom, + predicate=tvm.tir.all(1 <= h3 * 8 + h2 * 5 + h1, h3 * 8 + h1 * 2 + h2 < 224), + mode="upperbound", ) @@ -328,12 +369,7 @@ def test_region_lower_bound_unfusable(): tvm.tir.Var("j", "int32"): tvm.ir.Range(4), } i, j = var_dom - region = [ - tvm.ir.Range.from_min_extent((i + j) // 2, 1), - ] - result = tvm.arith.estimate_region_lower_bound(region, var_dom, predicate=True) - assert result[0].min_value == 0 - assert result[0].max_value == 5 + check_region_bound({(i + j) // 2: (0, 6)}, var_dom, mode="lowerbound") def test_union_lower_bound(): @@ -347,18 +383,4 @@ def test_union_lower_bound(): if __name__ == "__main__": - test_basic() - test_vector() - test_add_sub() - test_mul_div() - test_max_min() - test_select() - test_mod() - test_region_lower_bound_not_independent() - test_region_lower_bound_stride_too_wide() - test_region_lower_bound_small_stride() - test_region_lower_bound_split_predicate() - test_region_lower_bound_multiple_variables() - test_region_lower_bound_negative_scale() - test_region_lower_bound_for_non_perfect_tile() - test_union_lower_bound() + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_compact_buffer_region.py b/tests/python/unittest/test_tir_transform_compact_buffer_region.py index 31bb9b8b7cdb..049de0bed4f9 100644 --- a/tests/python/unittest/test_tir_transform_compact_buffer_region.py +++ 
b/tests/python/unittest/test_tir_transform_compact_buffer_region.py @@ -909,5 +909,105 @@ def compacted_func(A: T.Buffer[(960, 770), "float32"], B: T.Buffer[(770, 2304), _check(func, compacted_func) +def test_compact_dependent_buffer_indices(): + """Check the upper bound on different indices could be independently estimated.""" + + @T.prim_func + def diagonal_access(): + for i in range(8): + with T.block(): + A = T.alloc_buffer((256, 256), "float32") + for j, k in T.grid(8, 8): + with T.block(): + T.where(j * 8 + k < 60) + A[i * 64 + j * 8 + k, i * 64 + j * 8 + k] = 1.0 + + @T.prim_func + def diagonal_access_compacted() -> None: + for i in T.serial(8): + with T.block(): + A = T.alloc_buffer([60, 60], dtype="float32") + for j, k in T.grid(8, 8): + with T.block(): + T.where(j * 8 + k < 60) + A[j * 8 + k, j * 8 + k] = 1.0 + + _check(diagonal_access, diagonal_access_compacted) + + +def test_compact_dependent_buffer_indices_of_packed_matmul(): + """Check the outer dimension of the packed M-dim should be compacted to 1 wrt split condition.""" + + @T.prim_func + def nonuniform_packed_matmul_write_cache( + A: T.Buffer[(1020, 64), "float32"], + B: T.Buffer[(1000, 64), "float32"], + C: T.Buffer[(1020, 1000), "float32"], + ): + for i0, i1 in T.grid(4, 1): + with T.block(): + C_local2 = T.alloc_buffer([4, 1, 16, 1000, 16], dtype="float32", scope="local") + C_local1 = T.alloc_buffer([1020, 1000], dtype="float32", scope="local") + for ax0, ax1, ax2 in T.grid(255, 1000, 64): + with T.block("matmul"): + if ax2 == 0: + C_local1[i0 * 255 + ax0, ax1] = 0 + C_local1[i0 * 255 + ax0, ax1] = ( + C_local1[i0 * 255 + ax0, ax1] + A[i0 * 255 + ax0, ax2] * B[ax1, ax2] + ) + for ax0, ax1 in T.grid(255, 1000): + with T.block("st1"): + C_local2[ + (i0 * 255 + ax0) // 255, + 0, + (i0 * 255 + ax0) % 255 // 16, + ax1, + (i0 * 255 + ax0) % 255 % 16, + ] = C_local1[i0 * 255 + ax0, ax1] + for ax0, ax1, ax2 in T.grid(16, 16, 1000): + with T.block("st2"): + T.where(ax0 * 16 + ax1 < 255) + C[i0 * 255 + 
(ax0 * 16 + ax1), i1 * 1000 + ax2] = C_local2[ + (i0 * 255 + ax0 * 16 + ax1) // 255, + 0, + (i0 * 255 + ax0 * 16 + ax1) % 255 // 16, + i1 * 1000 + ax2, + (i0 * 255 + ax0 * 16 + ax1) % 255 % 16, + ] + + @T.prim_func + def nonuniform_packed_matmul_write_cache_compacted( + A: T.Buffer[(1020, 64), "float32"], + B: T.Buffer[(1000, 64), "float32"], + C: T.Buffer[(1020, 1000), "float32"], + ) -> None: + for i0, i1 in T.grid(4, 1): + with T.block(): + C_local2 = T.alloc_buffer([1, 1, 15, 1000, 16], dtype="float32", scope="local") + C_local1 = T.alloc_buffer([255, 1000], dtype="float32", scope="local") + for ax0, ax1, ax2 in T.grid(255, 1000, 64): + with T.block("matmul"): + if ax2 == 0: + C_local1[ax0, ax1] = 0 + C_local1[ax0, ax1] = ( + C_local1[ax0, ax1] + A[i0 * 255 + ax0, ax2] * B[ax1, ax2] + ) + for ax0, ax1 in T.grid(255, 1000): + with T.block("st1"): + C_local2[0, 0, ax0 // 16, ax1, ax0 % 16] = C_local1[ax0, ax1] + for ax0, ax1, ax2 in T.grid(16, 16, 1000): + with T.block("st2"): + T.where(ax0 * 16 + ax1 < 255) + C[i0 * 255 + ax0 * 16 + ax1, ax2] = C_local2[ + (ax0 * 16 + ax1) // 255, + 0, + (ax0 * 16 + ax1) % 255 // 16, + ax2, + (ax0 * 16 + ax1) % 255 % 16, + ] + + _check(nonuniform_packed_matmul_write_cache, nonuniform_packed_matmul_write_cache_compacted) + + if __name__ == "__main__": tvm.testing.main() From 592148abf6866a41eefa736efca067d42f5aea86 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Wed, 24 Aug 2022 11:24:05 +0100 Subject: [PATCH 026/704] [Target] Replace IsaAnalyzer with Target Features (#12322) This is clean up to use the new `target.features` instead of `IsaAnalyzer`. 
--- python/tvm/relay/op/strategy/arm_cpu.py | 17 ++++------- python/tvm/target/arm_isa.py | 39 ------------------------- tests/micro/zephyr/test_zephyr.py | 4 +-- 3 files changed, 6 insertions(+), 54 deletions(-) delete mode 100644 python/tvm/target/arm_isa.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 54e1c871f504..ba28b6c7c31c 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -24,7 +24,6 @@ from ....auto_scheduler import is_auto_scheduler_enabled from ....meta_schedule import is_meta_schedule_enabled -from ....target import arm_isa from ....topi.generic import conv2d as conv2d_generic from .. import op as _op from .generic import * @@ -57,15 +56,14 @@ def schedule_concatenate_arm_cpu(_, outs, target): def schedule_pool_arm_cpu(attrs, outs, target): """schedule pooling ops arm cpu""" layout = attrs.layout - isa = arm_isa.IsaAnalyzer(target) avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs) with target: if ( avg_pool - and isa.has_dsp_support + and target.features.has_dsp and layout in ("NCW", "NCHW") or not avg_pool - and isa.has_dsp_support + and target.features.has_dsp and layout in ("NWC", "NHWC") ): return topi.arm_cpu.schedule_pool(outs, layout) @@ -87,8 +85,6 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") - isa = arm_isa.IsaAnalyzer(target) - if groups == 1: if layout == "NCHW": if kernel_layout == "OIHW": @@ -163,7 +159,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_hwcn.generic", ) elif layout == "NHWC": - if isa.has_dsp_support and kernel_layout == "HWOI": + if target.features.has_dsp and kernel_layout == "HWOI": strategy.add_implementation( wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp), wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp), @@ -473,10 +469,9 @@ def 
schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): def schedule_dense_arm_cpu(attrs, inputs, out_type, target): """dense arm cpu strategy""" strategy = _op.OpStrategy() - isa = arm_isa.IsaAnalyzer(target) data, _ = inputs - if isa.has_dsp_support and data.dtype in ["int8", "int16"]: + if target.features.has_dsp and data.dtype in ["int8", "int16"]: strategy.add_implementation( wrap_compute_dense(topi.arm_cpu.dense_dsp), wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp), @@ -506,10 +501,8 @@ def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target): if dilation[0] < 1: raise ValueError("dilation should be a positive value") - isa = arm_isa.IsaAnalyzer(target) - if kernel_layout == "WOI": - if layout == "NWC" and isa.has_dsp_support: + if layout == "NWC" and target.features.has_dsp: strategy.add_implementation( wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp), wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp), diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py deleted file mode 100644 index a5ac9b1563a5..000000000000 --- a/python/tvm/target/arm_isa.py +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Defines functions to analyze available opcodes in the ARM ISA.""" - -import tvm.target - - -ARM_MPROFILE_DSP_SUPPORT_LIST = [ - "cortex-m7", - "cortex-m4", - "cortex-m33", - "cortex-m35p", - "cortex-m55", -] - - -class IsaAnalyzer(object): - """Checks ISA support for given target""" - - def __init__(self, target): - self.target = tvm.target.Target(target) - - @property - def has_dsp_support(self): - return self.target.mcpu is not None and self.target.mcpu in ARM_MPROFILE_DSP_SUPPORT_LIST diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 9c0c3fefb488..8d9a73704d8e 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -32,7 +32,6 @@ from tvm.relay.testing import byoc from tvm.contrib import utils from tvm.micro.testing.utils import check_tune_log -from tvm.target import arm_isa import test_utils @@ -549,8 +548,7 @@ def test_schedule_build_with_cmsis_dependency( build_config = {"debug": microtvm_debug} target = tvm.target.target.micro(model, options=["-keys=arm_cpu,cpu"]) - isa = arm_isa.IsaAnalyzer(target) - if not isa.has_dsp_support: + if not target.features.has_dsp: pytest.skip(f"ISA does not support DSP. target: {target}") # Create a Relay conv2d From 6e79f64108b26b504089354cfce5e182001a70d1 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Wed, 24 Aug 2022 13:44:55 +0100 Subject: [PATCH 027/704] [CI] Set test python.contrib.test_onnx.test_resize as xfail (#12568) `python.contrib.test_onnx.test_resize` is failing due to a numerical accuracy issue, reported in #12567. This patch marks that test as an xfail, so that other tests can be enabled, while this one is investigated separately. 
--- tests/python/contrib/test_onnx.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py index 214166cebb9d..afebc2295a68 100644 --- a/tests/python/contrib/test_onnx.py +++ b/tests/python/contrib/test_onnx.py @@ -655,6 +655,7 @@ def verify_cast(dshape, dtype): verify_cast(i, o_dtype) +@pytest.mark.xfail(reason="Known failing test. See issue #12567.") def test_resize(): """Resize unit test.""" From a0fe74b3c3608929b21faeaea422ac09aa2f75eb Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 24 Aug 2022 13:45:57 +0100 Subject: [PATCH 028/704] [ETHOSN] Support multiply conversion to depthwise (#12403) Multiply can be supported when offloaded to the NPU by a conversion to a depthwise convolution operation. This is only supported when the multiply operation has a single variable input with the other being a constant of shape [1, ..., C]. This commit adds a new pass "ConvertEquivalents" (name subject to change) to handle this conversion before codegen. 
--- python/tvm/relay/op/contrib/_ethosn.py | 1 + python/tvm/relay/op/contrib/ethosn.py | 80 ++++++-- .../contrib/ethosn/convert_equivalent.cc | 144 +++++++++++++ src/relay/op/make_op.h | 2 + src/relay/qnn/utils.h | 4 + src/relay/transforms/pattern_utils.h | 34 +++ .../test_ethosn/test_convert_equivalents.py | 142 +++++++++++++ .../contrib/test_ethosn/test_multiply.py | 193 ++++++++++++++++++ 8 files changed, 582 insertions(+), 18 deletions(-) create mode 100644 src/relay/backend/contrib/ethosn/convert_equivalent.cc create mode 100644 tests/python/contrib/test_ethosn/test_convert_equivalents.py create mode 100644 tests/python/contrib/test_ethosn/test_multiply.py diff --git a/python/tvm/relay/op/contrib/_ethosn.py b/python/tvm/relay/op/contrib/_ethosn.py index ea2915675ec6..9c7c922fdfb0 100644 --- a/python/tvm/relay/op/contrib/_ethosn.py +++ b/python/tvm/relay/op/contrib/_ethosn.py @@ -20,3 +20,4 @@ import tvm._ffi tvm._ffi._init_api("relay.ethos-n.support", __name__) +tvm._ffi._init_api("relay.backend.contrib.ethos-n", __name__) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 469939ecf0b8..73dd6b735775 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -25,7 +25,7 @@ from tvm.relay.build_module import bind_params_by_name from ...dataflow_pattern import is_constant, is_op, wildcard -from . import _ethosn as support +from . import _ethosn from .register import register_pattern_table @@ -60,6 +60,18 @@ def ethosn_api_version() -> str: return tvm.get_global_func("relay.ethos-n.api.version")() +def ConvertEquivalents() -> tvm.ir.IRModule: # pylint: disable=invalid-name + """Converts operations into a numerically equivalent form + that can be understood by the NPU codegen. + + Return + ------ + Pass + The module pass. 
+ """ + return _ethosn.ConvertEquivalents() + + def partition_for_ethosn(mod, params=None, **opts): """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU. @@ -107,9 +119,9 @@ def partition_for_ethosn(mod, params=None, **opts): transform.AnnotateTarget("ethos-n"), transform.MergeCompilerRegions(), transform.PartitionGraph(), + ConvertEquivalents(), ] ) - return seq(mod) @@ -183,70 +195,102 @@ def qnn_resize_pattern(): ) return pattern + def qnn_mul_pattern(): + """ + Multiply is supported when one input is a constant of shape [1, ..., C], + where C matches the number of channels of the other input. + """ + mul_op = is_op("qnn.mul") + gen_mul_inputs = lambda x, y: mul_op( + x, + y, + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + ) + input_is_left = gen_mul_inputs(wildcard(), is_constant()) + input_is_right = gen_mul_inputs(is_constant(), wildcard()) + return input_is_left | input_is_right + def check_conv2d(extract): """Check if a conv2d is supported by Ethos-N.""" if not ethosn_available(): return False - return support.conv2d(extract) + return _ethosn.conv2d(extract) def check_fc(extract): """Check if a fully connected is supported by Ethos-N.""" if not ethosn_available(): return False - return support.fc(extract) + return _ethosn.fc(extract) def check_avg_pool2d(extract): """Check if a avg pool2d is supported by Ethos-N.""" if not ethosn_available(): return False - return support.avg_pool2d(extract) + return _ethosn.avg_pool2d(extract) def check_mean(extract): """Check if mean is supported by Ethos-N.""" if not ethosn_available(): return False - return support.mean(extract) + return _ethosn.mean(extract) def check_sigmoid(extract): """Check if a sigmoid is supported by Ethos-N.""" if not ethosn_available(): return False - return support.sigmoid(extract) + return _ethosn.sigmoid(extract) def check_tanh(extract): """Check if tanh is supported by Ethos-N.""" if not ethosn_available(): 
return False - return support.tanh(extract) + return _ethosn.tanh(extract) def check_leaky_relu(extract): """Check if Leaky ReLU is supported.""" if not ethosn_available(): return False - return support.leaky_relu(extract) + return _ethosn.leaky_relu(extract) + + def check_mul(extract): + """Check if Mul is supported.""" + if not ethosn_available(): + return False + # Do not support scalar constants for now + check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0 + if check_scalar(extract.args[0]) or check_scalar(extract.args[1]): + return False + extract = _ethosn.ConvertQnnMultiply(extract) + return _ethosn.conv2d(extract) def check_requantize(extract): """Check if requantize is supported.""" if not ethosn_available(): return False - return support.requantize(extract) + return _ethosn.requantize(extract) def check_resize(extract): """Check if resize (nearest neighbor) is supported.""" if not ethosn_available(): return False - return support.resize(extract) + return _ethosn.resize(extract) return [ + ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul), ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d), ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d), ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid), @@ -274,7 +318,7 @@ def max_pool2d(expr): if not ethosn_available(): return False - return support.max_pool2d(expr) + return _ethosn.max_pool2d(expr) @tvm.ir.register_op_attr("reshape", "target.ethos-n") @@ -285,7 +329,7 @@ def reshape(expr): if not _is_ethosn_composite(expr.args[0]): return False - return support.reshape(expr) + return _ethosn.reshape(expr) @tvm.ir.register_op_attr("qnn.add", "target.ethos-n") @@ -294,7 +338,7 @@ def qnn_add(expr): if not ethosn_available(): return False - return support.addition(expr) + return _ethosn.addition(expr) @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n") @@ -302,7 +346,7 @@ def qnn_concatenate(expr): """Check if a concatenate is supported 
by Ethos-N.""" if not ethosn_available(): return False - if not support.concatenate(expr): + if not _ethosn.concatenate(expr): return False # Support library has some unenforced restrictions on qnn params @@ -332,7 +376,7 @@ def split(expr): return False if ethosn_api_version() >= LooseVersion("3.0.1"): return False - if not support.split(expr): + if not _ethosn.split(expr): return False return True @@ -343,7 +387,7 @@ def depth_to_space(expr): """Check if a depth_to_space is supported by Ethos-N.""" if not ethosn_available(): return False - if not support.depth_to_space(expr): + if not _ethosn.depth_to_space(expr): return False return True @@ -354,7 +398,7 @@ def clip(expr): """Check if a clip is supported by Ethos-N.""" if not ethosn_available(): return False - if not support.relu(expr): + if not _ethosn.relu(expr): return False return True diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc new file mode 100644 index 000000000000..6b64467047f4 --- /dev/null +++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/backend/contrib/ethosn/convert_equivalent.cc + * \brief Converts operations into a numerically equivalent form + * that can be understood by the NPU codegen. + */ + +#include +#include +#include + +#include + +#include "../../../qnn/utils.h" +#include "../../../transforms/pattern_utils.h" +#include "../../../transforms/simplify_expr.h" + +namespace tvm { +namespace relay { +namespace contrib { +namespace ethosn { + +/*! + * \brief Converts qnn.mul to mathematically equivalent + * qnn.conv2d depthwise operation. + */ +Expr ConvertQnnMultiply(const Expr& expr) { + Call call = Downcast(expr); + + Expr input1 = call->args[0]; + Expr input2 = call->args[1]; + Expr input1_scale = call->args[2]; + Expr input1_zero_point = call->args[3]; + Expr input2_scale = call->args[4]; + Expr input2_zero_point = call->args[5]; + // Reverse the inputs if the constant is first input + if (call->args[0]->IsInstance()) { + input1 = call->args[1]; + input2 = call->args[0]; + input1_scale = call->args[4]; + input1_zero_point = call->args[5]; + input2_scale = call->args[2]; + input2_zero_point = call->args[3]; + } + Expr output_scale = call->args[6]; + Expr output_zero_point = call->args[7]; + + const auto* input_constant = input2.as(); + ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey(); + const auto* input_constant_tt = input_constant->checked_type().as(); + int channels = input_constant_tt->shape.back().as()->value; + + runtime::NDArray input_data = input_constant->data; + runtime::NDArray kernel_data_hwoi = + runtime::NDArray::Empty({1, 1, channels, 1}, input_data->dtype, input_data->device); + kernel_data_hwoi.CopyFrom(input_data); + Constant kernel = Constant(kernel_data_hwoi, input_constant->span); + + Type output_type = expr->checked_type(); + auto output_tt = output_type.as(); + ICHECK(output_tt) << "Expected TensorTypeNode but got " << output_type->GetTypeKey(); + DataType output_dtype = output_tt->dtype; + + Expr conv2d = 
qnn::MakeQnnConv2D( + input1, kernel, input1_zero_point, input2_zero_point, input1_scale, input2_scale, {1, 1}, + {0, 0, 0, 0}, {1, 1}, channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32)); + Constant bias_data = MakeConstantZeros(DataType::Int(32), {channels}); + Expr bias_add = MakeBiasAdd(conv2d, bias_data, 3); + Expr requantize = qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale, + output_zero_point, -1, "None", "None", output_dtype); + + return InferType(requantize); +} + +TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiply") + .set_body_typed(ConvertQnnMultiply); + +class ConvertEquivalentsMutator : public MixedModeMutator { + public: + Expr Rewrite_(const CallNode* pre, const Expr& post) override { + Call call = Downcast(post); + if (!call->op->IsInstance()) { + return post; + } + + Function func = Downcast(call->op); + Function new_func = Function(func); + auto composite_name = func->GetAttr(attr::kComposite); + if (composite_name == "ethos-n.qnn_mul") { + Expr new_func_body = ConvertQnnMultiply(func->body); + new_func = WithFields(func, func->params, new_func_body); + new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d")); + } + + Call new_call = WithFields(call, new_func); + return Downcast(new_call); + } +}; + +tvm::transform::Pass ConvertEquivalents() { + runtime::TypedPackedFunc pass_func = + [=](IRModule mod, transform::PassContext ctx) { + for (auto gv : mod->GetGlobalVars()) { + Function func = Downcast(mod->Lookup(gv)); + auto compiler_name = func->GetAttr(attr::kCompiler); + if (compiler_name.defined() && compiler_name == "ethos-n") { + auto new_body = ConvertEquivalentsMutator().VisitExpr(func->body); + if (!new_body.same_as(func->body)) { + Function new_func = WithFields(func, func->params, new_body); + mod->Update(gv, new_func); + } + } + } + return mod; + }; + return tvm::transform::CreateModulePass( + pass_func, 0, 
"relay.backend.contrib.ethos-n.ConvertEquivalents", {"InferType"}); +} + +TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertEquivalents") + .set_body_typed(ConvertEquivalents); + +} // namespace ethosn +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index c850bf8958c9..85938a739182 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -117,6 +117,8 @@ Expr MakeShapeOf(Expr data, DataType dtype); Expr MakeTake(Expr data, Expr indices, Integer batch_dims, Integer axis, String mode); +Expr MakeBiasAdd(Expr data, Expr bias, int axis); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_MAKE_OP_H_ diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h index 18c592f2ed69..d084e4871e95 100644 --- a/src/relay/qnn/utils.h +++ b/src/relay/qnn/utils.h @@ -121,6 +121,10 @@ static inline Expr Requantize(const Expr& data, const Array& input_sh attrs.operator->(), input_shape, attrs->out_dtype); } +Expr MakeRequantize(Expr data, Expr input_scale, Expr input_zero_point, Expr output_scale, + Expr output_zero_point, int axis, String rounding, String compute_dtype, + DataType out_dtype); + Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, const Expr& input_zero_point, const Array& types, const DequantizeAttrs* attrs); diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h index d05d39b733d3..ffe1cc2ca2ab 100644 --- a/src/relay/transforms/pattern_utils.h +++ b/src/relay/transforms/pattern_utils.h @@ -344,6 +344,40 @@ static inline Constant MakeConstantTensor(DataType dtype, std::vector s return Constant(arr); } +/*! + * \brief Create a Constant tensor of zeros. + * + * \param dtype The data type. + * \param shape The shape of the output constant tensor. + * \return A Constant. 
+ */ +static inline Constant MakeConstantZeros(DataType dtype, std::vector shape) { + runtime::NDArray arr = runtime::NDArray::Empty(shape, dtype, {kDLCPU, 0}); + int64_t data_size = 1; + for (int64_t dim : shape) { + data_size *= dim; + } + TVM_DTYPE_DISPATCH(dtype, DType, { + for (int64_t i = 0; i < data_size; i++) { + if (dtype == DataType::Float(16)) { + // convert to float16 + // storage is uint16_t + // Similar handling as that in MakeConstantScalar + *(static_cast(arr->data) + i) = + __truncXfYf2__(static_cast(0)); + } else if (dtype == DataType::BFloat(16)) { + // convert to bfloat16 + // storage is uint16_t + *(static_cast(arr->data) + i) = + __truncXfYf2__(static_cast(0)); + } else { + *(static_cast(arr->data) + i) = 0; + } + } + }) + return Constant(arr); +} + /*! * \brief Check whether a shape is static and create corresponding Constant. Eventually this will be removed and replaced with CheckConstantShapeArrayInteger diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py new file mode 100644 index 000000000000..570009422067 --- /dev/null +++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Unit tests for the convert equivalents pass.""" + +import pytest +import numpy as np + +import tvm +from tvm import relay +from tvm.testing import requires_ethosn +from tvm.relay.op.contrib.ethosn import ConvertEquivalents + +from . import infrastructure as tei + + +def _assert_structural_equal(a, b): + """Check structural equality of two Relay expressions.""" + reason = ( + "Actual and expected relay functions are not equal. " + "ConvertEquivalents is not correctly transforming the input " + "graph." + ) + assert tvm.ir.structural_equal(a, b), reason + + +def _create_npu_module(inputs, expr, composite_name, ext_func_name): + """Wraps an operator as an NPU module.""" + gen_vars = lambda prefix, vars: [ + relay.var( + prefix + var.name_hint, shape=var.type_annotation.shape, dtype=var.type_annotation.dtype + ) + for var in vars + ] + + mod = tvm.ir.IRModule() + + func = relay.Function(relay.analysis.free_vars(expr), expr) + func = func.with_attr("Composite", composite_name) + inner_vars = gen_vars("inner_", inputs) + call = relay.Call(func, inner_vars) + + func2 = relay.Function(relay.analysis.free_vars(call), call) + func2 = func2.with_attr("Compiler", "ethos-n") + func2 = func2.with_attr("global_symbol", ext_func_name) + mod[ext_func_name] = func2 + mod = relay.transform.InferType()(mod) + + outer_vars = gen_vars("outer_", inputs) + out = relay.Call(mod.get_global_var(ext_func_name), outer_vars) + mod["main"] = relay.Function(relay.analysis.free_vars(out), out) + mod = relay.transform.InferType()(mod) + return mod + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize("shape,channels", [((1, 4, 4, 8), 8), ((1, 16, 12, 4), 4)]) +@pytest.mark.parametrize("reverse_inputs", [True, False]) +def test_multiply_to_depthwise(dtype, shape, channels, reverse_inputs): + """Check that multiply is correctly converted to a 
depthwise operation.""" + np.random.seed(0) + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + input2_zp = np.random.randint(data_min, data_max) + input2_sc = np.random.random() * 2 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3] + ) + x = relay.var("x", shape=shape, dtype=dtype) + constant_shape = (1, 1, 1, channels) + y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype) + + def before(): + y = relay.const(y_data, dtype=dtype) + expr = relay.qnn.op.mul( + y if reverse_inputs else x, + x if reverse_inputs else y, + relay.const(input_sc, "float32"), + relay.const(input_zp, "int32"), + relay.const(input2_sc, "float32"), + relay.const(input2_zp, "int32"), + relay.const(output_sc, "float32"), + relay.const(output_zp, "int32"), + ) + return _create_npu_module([x], expr, "ethos-n.qnn_mul", "ext_func") + + def expected(): + constant_shape_hwoi = (1, 1, channels, 1) + y_data_hwoi = y_data.reshape(constant_shape_hwoi) + y_hwoi = relay.const(y_data_hwoi, dtype=dtype) + expr = relay.qnn.op.conv2d( + x, + y_hwoi, + relay.const(input2_zp if reverse_inputs else input_zp, "int32"), + relay.const(input_zp if reverse_inputs else input2_zp, "int32"), + relay.const(input2_sc if reverse_inputs else input_sc, "float32"), + relay.const(input_sc if reverse_inputs else input2_sc, "float32"), + (1, 1), + channels, + (1, 1), + (0, 0), + (1, 1), + channels, + "NHWC", + "HWOI", + "NHWC", + "int32", + ) + expr = relay.nn.bias_add(expr, relay.const(np.zeros((channels,), dtype="int32")), axis=3) + expr = relay.qnn.op.requantize( + expr, + relay.const(input2_sc if reverse_inputs else input_sc, "float32"), + relay.const(input2_zp if reverse_inputs else input_zp, "int32"), + relay.const(output_sc, "float32"), + relay.const(output_zp, "int32"), + out_dtype=dtype, + ) + return 
_create_npu_module([x], expr, "ethos-n.qnn_conv2d", "ext_func") + + mod = before() + mod = ConvertEquivalents()(mod) + expected_mod = expected() + _assert_structural_equal(mod["ext_func"], expected_mod["ext_func"]) diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py new file mode 100644 index 000000000000..38d8516b6721 --- /dev/null +++ b/tests/python/contrib/test_ethosn/test_multiply.py @@ -0,0 +1,193 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Integration tests for Multiply.""" + +import pytest +import numpy as np + +import tvm +from tvm import relay +from tvm.testing import requires_ethosn + +from . 
import infrastructure as tei + + +def _get_model( + shape, + constant_shape, + input_zp, + input_sc, + input2_zp, + input2_sc, + output_zp, + output_sc, + dtype, + reverse_inputs=False, +): + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + + x = relay.var("x", shape=shape, dtype=dtype) + y_data = np.random.randint(data_min, data_max + 1, size=constant_shape, dtype=dtype) + y = relay.const(y_data, dtype=dtype) + + out = relay.qnn.op.mul( + y if reverse_inputs else x, + x if reverse_inputs else y, + relay.const(input_sc, "float32"), + relay.const(input_zp, "int32"), + relay.const(input2_sc, "float32"), + relay.const(input2_zp, "int32"), + relay.const(output_sc, "float32"), + relay.const(output_zp, "int32"), + ) + params = {"y": y_data} + return out, params + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "shape,constant_shape", [((1, 4, 4, 8), (1, 1, 1, 8)), ((1, 16, 12, 4), (4,))] +) +@pytest.mark.parametrize("reverse_inputs", [False, True]) +def test_multiply(dtype, shape, constant_shape, reverse_inputs): + """Compare Multiply output with TVM.""" + np.random.seed(0) + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + input2_zp = np.random.randint(data_min, data_max) + input2_sc = np.random.random() * 2 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3] + ) + + model, params = _get_model( + shape, + constant_shape, + input_zp, + input_sc, + input2_zp, + input2_sc, + output_zp, + output_sc, + dtype, + reverse_inputs, + ) + inputs = {"x": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype))} + outputs = [] + for npu in [False, True]: + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + + tei.verify(outputs, dtype, 1) + + 
+@requires_ethosn +def test_multiply_multiple_inputs_unsupported(): + """Check multiply operator with two inputs is not offloaded.""" + np.random.seed(0) + + shape = (1, 4, 5, 6) + dtype = "int8" + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + input2_zp = np.random.randint(data_min, data_max) + input2_sc = np.random.random() * 2 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3] + ) + + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.var("y", shape=shape, dtype=dtype) + model = relay.qnn.op.mul( + x, + y, + relay.const(input_sc, "float32"), + relay.const(input_zp, "int32"), + relay.const(input2_sc, "float32"), + relay.const(input2_zp, "int32"), + relay.const(output_sc, "float32"), + relay.const(output_zp, "int32"), + ) + + expected_host_ops = 1 + npu_partitions = 0 + for npu in [False, True]: + mod = tei.make_module(model, {}) + tei.build( + mod, + {}, + npu=npu, + expected_host_ops=expected_host_ops, + npu_partitions=npu_partitions, + ) + + +@requires_ethosn +def test_multiply_unsupported_datatype(): + """Check multiply operator with unsupported datatype is not offloaded.""" + np.random.seed(0) + + shape = (1, 4, 5, 6) + dtype = "int16" + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + input2_zp = np.random.randint(data_min, data_max) + input2_sc = np.random.random() * 2 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, input2_zp, input2_sc, 1, 1, shape[3] + ) + + x = relay.var("x", shape=shape, dtype=dtype) + y = relay.var("y", shape=shape, dtype=dtype) + model = relay.qnn.op.mul( + x, + y, + relay.const(input_sc, "float32"), + relay.const(input_zp, "int32"), + relay.const(input2_sc, "float32"), + relay.const(input2_zp, "int32"), + 
relay.const(output_sc, "float32"), + relay.const(output_zp, "int32"), + ) + + expected_host_ops = 1 + npu_partitions = 0 + for npu in [False, True]: + mod = tei.make_module(model, {}) + tei.build( + mod, + {}, + npu=npu, + expected_host_ops=expected_host_ops, + npu_partitions=npu_partitions, + ) From 038523e5a21e13ff2802913ec32b73fb47413b35 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Wed, 24 Aug 2022 07:13:28 -0700 Subject: [PATCH 029/704] [TIR] Expose Vector-related API in Python (#12571) This PR exposes the following TIR operation in python: - `vectorlow`: tested [here](https://github.com/apache/tvm/blob/592148abf6866a41eefa736efca067d42f5aea86/python/tvm/tir/tensor_intrin/arm_cpu.py#L62) - `vectorhigh`: tested [here](https://github.com/apache/tvm/blob/592148abf6866a41eefa736efca067d42f5aea86/python/tvm/tir/tensor_intrin/arm_cpu.py#L79) - `vectorcombine`: add new unittest Co-Authored-By: yongwww --- python/tvm/tir/__init__.py | 1 + python/tvm/tir/op.py | 57 ++++++++++++++++++++++ tests/python/unittest/test_tir_op_types.py | 24 +++++++++ 3 files changed, 82 insertions(+) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 7ea8c02bed85..f61e05cc92e9 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -52,6 +52,7 @@ from .op import tvm_tuple, tvm_struct_get, tvm_struct_set from .op import address_of, lookup_param, assume, undef from .op import tvm_thread_allreduce, type_annotation, tvm_access_ptr, tvm_throw_last_error +from .op import vectorlow, vectorhigh, vectorcombine from .op import infinity, reinterpret from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz from .op import sin, sinh, asin, asinh diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index 7ab1f3aaae23..c4618042b2dc 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -595,6 +595,63 @@ def tvm_throw_last_error(): return call_intrin("handle", "tir.tvm_throw_last_error") +def vectorlow(dtype, vec): + """Get the low 
level half of the vector + + Parameters + ---------- + dtype : str + The data type of the result. + + vec : list + The input vector. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin(dtype, "tir.vectorlow", vec) + + +def vectorhigh(dtype, vec): + """Get the high level half of the vector + + Parameters + ---------- + dtype : str + The data type of the result. + + vec : list + The input vector. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin(dtype, "tir.vectorhigh", vec) + + +def vectorcombine(dtype, vec1, vec2): + """Concat two vectors + + Parameters + ---------- + vec1 : list + The input vector. + + vec2 : list + The input vector. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin(dtype, "tir.vectorcombine", vec1, vec2) + + def ret(val): """Create a tir return expression diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py index ffee3b3b57c9..3f0ec37adb85 100644 --- a/tests/python/unittest/test_tir_op_types.py +++ b/tests/python/unittest/test_tir_op_types.py @@ -104,6 +104,27 @@ def test_tir_op_tvm_throw_last_error(): assert expr.op.name == "tir.tvm_throw_last_error" +def test_tir_op_vectorlow(): + buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1) + vec = buffer.vload([0, 0], dtype="int8x16") + expr = tir.vectorlow("int8x8", vec) + assert expr.op.name == "tir.vectorlow" + + +def test_tir_op_vectorhigh(): + buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1) + vec = buffer.vload([0, 0], dtype="int8x16") + expr = tir.vectorhigh("int8x8", vec) + assert expr.op.name == "tir.vectorhigh" + + +def test_tir_op_vectorcombine(): + buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1) + vec = buffer.vload([0, 0], dtype="int8x16") + expr = tir.vectorcombine("int8x8", vec, vec) + assert expr.op.name == "tir.vectorcombine" + + def test_tir_op_TVMBackendAllocWorkspace(): expr = 
tir.TVMBackendAllocWorkspace(0, 1, 2, 3, 4) assert expr.op.name == "tir.TVMBackendAllocWorkspace" @@ -130,5 +151,8 @@ def test_tir_op_TVMBackendFreeWorkspace(): test_tir_op_type_annotation() test_tir_op_tvm_access_ptr() test_tir_op_tvm_throw_last_error() + test_tir_op_vectorlow() + test_tir_op_vectorhigh() + test_tir_op_vectorcombine() test_tir_op_TVMBackendAllocWorkspace() test_tir_op_TVMBackendFreeWorkspace() From bf65b396c15b3cbec18fb1aecfa6862f58a2f307 Mon Sep 17 00:00:00 2001 From: Farshid Salemi Parizi Date: Wed, 24 Aug 2022 08:29:30 -0700 Subject: [PATCH 030/704] [Hexagon] Add support to run on multiple devices (#12504) * working in parralel using worker * creating launchers per test and clean up * clean up * ci change to distrube tests * ci work with any number of devices * fix running on simulator * adding function docstring * fix android_serial_number to always return a list of string * lint issue * fix internal error when skipping tests while androideserial number is not set * lint issue --- python/tvm/contrib/hexagon/pytest_plugin.py | 60 +++++++++++++++------ tests/scripts/setup-pytest-env.sh | 2 +- tests/scripts/task_python_hexagon.sh | 14 ++++- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py index f735c81ee0aa..65475d67f555 100644 --- a/python/tvm/contrib/hexagon/pytest_plugin.py +++ b/python/tvm/contrib/hexagon/pytest_plugin.py @@ -56,13 +56,16 @@ def _compose(args, decs): requires_hexagon_toolchain = tvm.testing.requires_hexagon(support_required="compile-only") -@pytest.fixture(scope="session") def android_serial_number() -> Optional[str]: + """Return the android serial number""" serial = os.getenv(ANDROID_SERIAL_NUMBER, default="") # Setting ANDROID_SERIAL_NUMBER to an empty string should be # equivalent to having it unset. 
if not serial.strip(): - serial = None + return None + + # Split android serial numbers into a list + serial = serial.split(",") return serial @@ -155,12 +158,16 @@ def adb_server_socket() -> str: @pytest.fixture(scope="session") def hexagon_server_process( - request, android_serial_number, rpc_server_port_for_session, adb_server_socket, skip_rpc + request, rpc_server_port_for_session, adb_server_socket, skip_rpc ) -> HexagonLauncherRPC: """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined. This launcher is started only once per test session. """ - if android_serial_number is None or android_serial_number == "simulator": + android_serial_num = android_serial_number() + + if android_serial_num is None: + pytest.skip("ANDROID_SERIAL_NUMBER is not set.") + if android_serial_num == ["simulator"]: yield None else: # Requesting these fixtures sets up a local tracker, if one @@ -175,16 +182,37 @@ def hexagon_server_process( "rpc_server_port": rpc_server_port_for_session, "adb_server_socket": adb_server_socket, } - launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) + workerinput = getattr(request.config, "workerinput", None) + if workerinput is None: # single-process execution + device_adr = read_device_list()[0] + else: # running in a subprocess here + device_adr = workerinput["device_adr"] + launcher = HexagonLauncher(serial_number=device_adr, rpc_info=rpc_info) try: if not skip_rpc: launcher.start_server() - yield launcher + yield {"launcher": launcher, "device_adr": device_adr} finally: if not skip_rpc: launcher.stop_server() +def read_device_list(): + return android_serial_number() + + +def pytest_configure(config): + # read device list if we are on the master + if not hasattr(config, "workerinput"): + config.iplist = read_device_list() + + +def pytest_configure_node(node): + # the master for each node fills slaveinput dictionary + # which pytest-xdist will transfer to the subprocess + 
node.workerinput["device_adr"] = node.config.iplist.pop() + + @pytest.fixture def hexagon_launcher( hexagon_server_process, @@ -192,14 +220,12 @@ def hexagon_launcher( tvm_tracker_host, tvm_tracker_port, adb_server_socket, - android_serial_number, ) -> HexagonLauncherRPC: """Initials and returns hexagon launcher which reuses RPC info and Android serial number.""" - if android_serial_number is None: - yield None + android_serial_num = android_serial_number() - if android_serial_number != "simulator": - rpc_info = hexagon_server_process._rpc_info + if android_serial_num != ["simulator"]: + rpc_info = hexagon_server_process["launcher"]._rpc_info else: rpc_info = { "rpc_tracker_host": tvm_tracker_host, @@ -208,13 +234,17 @@ def hexagon_launcher( "adb_server_socket": adb_server_socket, } - launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) try: - if android_serial_number == "simulator": + if android_serial_num == ["simulator"]: + launcher = HexagonLauncher(serial_number=android_serial_num[0], rpc_info=rpc_info) launcher.start_server() + else: + launcher = HexagonLauncher( + serial_number=hexagon_server_process["device_adr"], rpc_info=rpc_info + ) yield launcher finally: - if android_serial_number == "simulator": + if android_serial_num == ["simulator"]: launcher.stop_server() launcher.cleanup_directory() @@ -239,7 +269,7 @@ def terminate_rpc_servers(): # yield happens every time. serial = os.environ.get(ANDROID_SERIAL_NUMBER) yield [] - if serial == "simulator": + if serial == ["simulator"]: os.system("ps ax | grep tvm_rpc_x86 | awk '{print $1}' | xargs kill") diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index d27f008093e0..afb759c09356 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -74,7 +74,7 @@ function run_pytest() { suite_name="${test_suite_name}-${current_shard}-${ffi_type}" - if [[ ! "${extra_args[@]}" == *" -n"* ]]; then + if [ ! 
"${extra_args[@]}" == *" -n"* ] && [! "${extra_args[@]}" == *" -dist"* ]; then extra_args+=("-n=1") fi diff --git a/tests/scripts/task_python_hexagon.sh b/tests/scripts/task_python_hexagon.sh index c87bc9b250fa..f7c0a43c48e8 100755 --- a/tests/scripts/task_python_hexagon.sh +++ b/tests/scripts/task_python_hexagon.sh @@ -39,8 +39,20 @@ if [[ "${device_serial}" == "simulator" ]]; then export HEXAGON_SHARED_LINK_FLAGS="-Lbuild/hexagon_api_output -lhexagon_rpc_sim" fi +num_of_devices=0 +if [ ! "${device_serial}" == "simulator" ]; then + IFS=',' read -ra ADDR <<< "$device_serial" + for i in "${ADDR[@]}"; do + num_of_devices=$(($num_of_devices+1)) + done +fi + export ANDROID_SERIAL_NUMBER=${device_serial} -run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon +if [ "${device_serial}" == "simulator" ]; then + run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon +else + run_pytest ctypes python-contrib-hexagon tests/python/contrib/test_hexagon --tx $num_of_devices*popen --dist=load +fi if [[ "${device_serial}" == "simulator" ]]; then kill ${TRACKER_PID} From f53ee0cecf96adad71db92d2a0c488ca2dd6bee7 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 24 Aug 2022 11:44:59 -0700 Subject: [PATCH 031/704] [Hexagon] Fix missing pytest import (#12565) * Add pytest * lint --- tests/python/contrib/test_hexagon/topi/test_cast_slice.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py index 6569ce36bb0e..1b235a4daf52 100644 --- a/tests/python/contrib/test_hexagon/topi/test_cast_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_cast_slice.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
""" Tests for Hexagon slice cast ops """ +import pytest import numpy as np import tvm @@ -75,6 +76,7 @@ def test_cast_fp16_fp32_slice( """ if hexagon_session._launcher._serial_number != "simulator": pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11957") + target_hexagon = tvm.target.hexagon("v69") target = tvm.target.Target(target_hexagon, host=target_hexagon) cast_input = te.placeholder(input_shape, name="A", dtype=dtype) From 1afd0593956066635ee49297b731726c9218c91c Mon Sep 17 00:00:00 2001 From: Jyotsna Verma <73191103+jverma-quic@users.noreply.github.com> Date: Wed, 24 Aug 2022 13:55:50 -0500 Subject: [PATCH 032/704] [TOPI][Hexagon] Implement quantized avgpool (#12340) * [TOPI][Hexagon] Implement quantized avgpool * Fix pylint errors * Needed to adjust input padding for int8 buffer layout * Fix formatting issue * Add unit test for fixed-point conversion utility function Also, address review comments. * Remove pytest.skip for test_avg_pool2d_slice.py to enable on-target testing * Fix formatting issue * Update python/tvm/topi/hexagon/utils.py Co-authored-by: Christian Convey * Update comments and error messages * Address review comments * Import Tuple from typing * Address pylint error Co-authored-by: Christian Convey --- python/tvm/topi/hexagon/__init__.py | 1 + python/tvm/topi/hexagon/qnn/__init__.py | 20 ++ python/tvm/topi/hexagon/qnn/avg_pool2d.py | 205 +++++++++++++++++ python/tvm/topi/hexagon/slice_ops/__init__.py | 2 +- .../tvm/topi/hexagon/slice_ops/avg_pool2d.py | 24 +- python/tvm/topi/hexagon/utils.py | 136 ++++++++++++ .../contrib/test_hexagon/infrastructure.py | 55 ++++- .../test_fixed_point_conversion.py | 64 ++++++ .../topi/test_avg_pool2d_slice.py | 209 +++++++++++------- 9 files changed, 625 insertions(+), 91 deletions(-) create mode 100644 python/tvm/topi/hexagon/qnn/__init__.py create mode 100644 python/tvm/topi/hexagon/qnn/avg_pool2d.py create mode 100644 tests/python/contrib/test_hexagon/test_fixed_point_conversion.py diff --git 
a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py index 7b0aa59c8de3..dfe739288187 100644 --- a/python/tvm/topi/hexagon/__init__.py +++ b/python/tvm/topi/hexagon/__init__.py @@ -26,3 +26,4 @@ from .pooling import * from .reduce import * from .resize2d import * +from .qnn import * diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py new file mode 100644 index 000000000000..e27e3793d565 --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Computes and schedules for Hexagon quantized ops """ + +from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule diff --git a/python/tvm/topi/hexagon/qnn/avg_pool2d.py b/python/tvm/topi/hexagon/qnn/avg_pool2d.py new file mode 100644 index 000000000000..4aac15cbdc17 --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/avg_pool2d.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, unused-argument, too-many-locals + +""" Compute and schedule for quantized avg_pool2d op + +Please note the following assumptions made by the implementation: + +1) The input must be padded in advance to account for 'padding'. In addition, + both input and output must be padded as per the physical buffer layout. +2) The current implementation assumes 'count_include_pad' to be 'True'. It can be + modified to support 'False' case but the element count for the pooling window + must be pre-computed and provided as an input to reduce the run-time overhead. +3) 'padding' is ignored. It must be handled outside of the sliced op. +4) Please note that this implementation will not work if the output includes any + physical layout related padding as it can result into out-of-bound access + for the input. 
+""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value + + +def validate_out_shape(out_shape: list, in_shape: list, kernel: list, stride: list, dilation: list): + """Validate output shape""" + _, oh, ow, _ = out_shape + _, ih, iw, _ = in_shape + kh, kw = kernel + sh, sw = stride + dh, dw = dilation + if ih < (oh - 1) * sh + dh * (kh - 1) + 1: + raise RuntimeError("Output height is too large") + if iw < (ow - 1) * sw + dw * (kw - 1) + 1: + raise RuntimeError("Output width is too large") + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + + +def qnn_avg_pool2d_compute( + data: te.Tensor, + kernel: list, + stride: list, + dilation: list, + oshape: list, + odtype: str, + # quantization params: + input_zero_point: int, + input_scale: float, + output_zero_point: int, + output_scale: float, +): + """Compute for quantized avg_pool2d""" + kh, kw = kernel + rh = te.reduce_axis((0, kh), name="rh") + rw = te.reduce_axis((0, kw), name="rw") + ob, oh, ow, oc = oshape + if isinstance(ob, int): + validate_out_shape(oshape, data.shape, kernel, stride, dilation) + + if odtype == "uint8": + temp_dtype = "uint16" + elif odtype == "int8": + temp_dtype = "int16" + else: + raise RuntimeError(f"Unsupported output dtype, {odtype}'") + + sh, sw = stride + dh, dw = dilation + + PoolArea = kh * kw + + scale = input_scale / output_scale + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + scale_with_area = scale_fixed_point // PoolArea + corr = (output_zero_point << rsh) - input_zero_point * scale_fixed_point + + Sum = te.compute( + oshape, + lambda b, h, w, c: te.sum( + data[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(temp_dtype), axis=[rh, rw] + ), + name="sum", + ) + + Avg = te.compute( + oshape, + lambda b, h, w, c: saturate( + ((Sum[b, h, w, c] * scale_with_area) + corr) >> rsh, odtype + 
).astype(odtype), + name="avg", + ) + return Avg + + +def schedule_nhwc_8h8w32c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for input and output layout nhwc-8h8w32c""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + Sum = s.get_block("sum") + Avg = s.get_block("avg") + + input_transform_fn = get_layout_transform_fn(input_layout) + output_transform_fn = get_layout_transform_fn(output_layout) + s.transform_layout(Sum, ("read", 0), input_transform_fn) + s.transform_layout(Avg, ("write", 0), output_transform_fn) + + # Schedule 'Avg' + # Split and reorder the axes to iterate over the output tensor chunks. + # Each chunk consists for 2048 bytes with 32 channels being the fastest + # changing axis, followed by 8 width and then 8 height. + # The width is split by a factor of 4 and then fused with 32 channels + # to provide full vector length of data for the output tensor chunks. + # NOTE: These schedules are a work in progress and may require + # adjustments in future as some of the missing features for 2-d tensors + # become available. + n, h, w, c = s.get_loops(Avg) + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 8]) + wio, wii = s.split(wi, [None, 4]) + co, ci = s.split(c, [None, 32]) + s.reorder(n, ho, wo, co, hi, wio, wii, ci) + wii_ci = s.fuse(wii, ci) + s.vectorize(wii_ci) + + # Schedule 'Sum' + s.compute_at(Sum, wio) + Sum_axis = s.get_loops(Sum) + # Compute for 'Sum' includes reduction along height and width. The axes + # are being reordered so that 4 width and 32 channels become the + # inner-most loops which then can be fused and vectorized. However, + # vectorization of the 2-d tensors doesn't work when reduction is + # involved and requires codegen support that is yet to be added. 
+ s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-4], Sum_axis[-3]) + ci_wii = s.fuse(Sum_axis[-4], Sum_axis[-3]) + # s.vectorize(ci_wii) # Doesn't work + return s + + +def schedule_n11c_2048c(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Schedule for output layout: n11c-2048c, input layout: nhwc-8h8w32c""" + func = te.create_prim_func([ins, outs]) + s = tir.Schedule(func) + Sum = s.get_block("sum") + Avg = s.get_block("avg") + + input_transform_fn = get_layout_transform_fn(input_layout) + output_transform_fn = get_layout_transform_fn(output_layout) + s.transform_layout(Sum, ("read", 0), input_transform_fn) + s.transform_layout(Avg, ("write", 0), output_transform_fn) + + # Schedule 'Avg' + # Split and reorder the axes to iterate over the output tensor chunks. + # Each chunk consists for 2048 bytes. For n11c-2048c tensor layout, each chunk + # only contains 2048 channels which get split by a factor of 128 to be vectorized. + # NOTE: These schedules are a work in progress and may require + # adjustments in future as some of the missing features for 2-d tensors + # become available. + n, h, w, c = s.get_loops(Avg) + co, ci = s.split(c, [None, 2048]) + cio, cii = s.split(ci, [None, 128]) + s.vectorize(cii) + + # Schedule 'Sum' + # Compute for 'Sum' includes reduction along height and width. The axes are being + # reordered so that 128 channels become the inner-most loop and can be vectorized. + # However, vectorization of the 2-d tensors doesn't work when reduction is + # involved and requires codegen support that is yet to be added. 
+ s.compute_at(Sum, cio) + Sum_axis = s.get_loops(Sum) + s.reorder(Sum_axis[-2], Sum_axis[-1], Sum_axis[-3]) + # s.vectorize(Sum_axis[-3]) # Doesn't work + return s + + +def qnn_avg_pool2d_schedule(outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str): + """Quantized avg_pool2d schedule + + NOTE: This schedule assumes that both input and output tensors are in the form of + 2d discontiguous buffer and data is already arranged as per the input and output layout + respectively. + + """ + if output_layout == "nhwc-8h8w32c-2d": + return schedule_nhwc_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout == "n11c-2048c-2d": + return schedule_n11c_2048c(outs, ins, output_layout, input_layout) + raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index cda63e2e1c73..b96156dc46d2 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -17,7 +17,7 @@ """ Computes and Schedules for Hexagon slice ops. 
""" -from .avg_pool2d import avg_pool2d_compute, avg_pool2d_STIR_schedule +from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule from .max_pool2d import max_pool2d_compute, max_pool2d_STIR_schedule from .add_subtract_multiply import * from .argmax import argmax_compute, argmax_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py index 306be543d8fb..38e2ea577b68 100644 --- a/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/avg_pool2d.py @@ -49,33 +49,35 @@ def validate_out_shape(out_shape, in_shape, kernel, stride, dilation): raise RuntimeError("Output width is too large") -def avg_pool2d_compute(A, out_shape, kernel, stride, dilation): +def avg_pool2d_compute(A, kernel, stride, dilation, oshape, odtype="float16"): """avg_pool2d compute""" + if odtype != "float16": + RuntimeError(f"Unsupported output dtype '{odtype}'") kh, kw = kernel rh = te.reduce_axis((0, kh), name="rh") rw = te.reduce_axis((0, kw), name="rw") - ob, oh, ow, oc = out_shape + ob, oh, ow, oc = oshape if isinstance(ob, int): - validate_out_shape(out_shape, A.shape, kernel, stride, dilation) + validate_out_shape(oshape, A.shape, kernel, stride, dilation) sh, sw = stride dh, dw = dilation InvArea = float(1) / (kh * kw) Sum = te.compute( - out_shape, + oshape, lambda b, h, w, c: te.sum( A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw] ), name="sum", ) Avg = te.compute( - out_shape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" + oshape, lambda b, h, w, c: (Sum[b, h, w, c] * InvArea).astype(A.dtype), name="avg" ) return Avg -def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): +def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): """Schedule for input and output layout nhwc-8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) @@ -106,7 +108,7 @@ def 
STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st return s -def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): +def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) @@ -132,10 +134,10 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): return s -def avg_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str): - """STIR based schedule""" +def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str): + """avg_pool2d schedule""" if output_layout == "nhwc-8h2w32c2w-2d": - return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) + return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) if output_layout == "n11c-1024c-2d": - return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout) + return schedule_n11c_1024c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 3b8914ffe937..c056408947b7 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -19,6 +19,9 @@ """Common hexagon specific utilities""" +import math +import struct +from typing import Tuple from tvm import te @@ -102,6 +105,11 @@ def nhwc_8h8w32c_2d(n, h, w, c): return [n, h // 8, w // 8, c // 32, te.AXIS_SEPARATOR, h % 8, w % 8, c % 32] +def n11c_2048c_2d(n, h, w, c): + """Return index map for n11c_2048c 2d layout""" + return [n, h, w, c // 2048, te.AXIS_SEPARATOR, c % 2048] + + def iohw_16i32o2i_1d(height, width, in_channel, out_channel): return [ in_channel // 32, @@ -150,4 +158,132 @@ def get_layout_transform_fn(layout): return nc_2048_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d + if layout == "n11c-2048c-2d": + 
return n11c_2048c_2d raise RuntimeError(f"Unexpected layout '{layout}'") + + +def get_fixed_point_value(flp: float, dtype: str = "int16") -> Tuple[int, int]: + """ + Return fixed-point value and the corresponding log2 of the scale factor used to compute + this value. + + Parameters + ---------- + flp : float + Floating-point value to be converted + dtype : str + Type of the resulting fixed-point value. By default, it's set to "int16" + + Returns + ------- + fixed_point_value : int + Fixed-point value for the given floating-point value + exp_scale_factor : int + log2 of the scale factor + + Convert floating-point value into fixed-point number. This is done by + multiplying the value by a scaling factor and then rounding it to the nearest + integer value. + + As per IEEE-754 standard, a floating-point value can be represented as follows + [see: https://en.wikipedia.org/wiki/IEEE_754-1985]: + (-1)^S * M * 2^(E-Bias) + + Here, + * S is the signed bit (0 or 1). + * M is the mantissa. It's composed of an implicit 1 for the normalized floating-point + values or 0 for the denormalized values, and the fraction part. This ensures that + mantissa is always within [0, 2) range. Please note that this function doesn't + handle denormalized values. + * E is the exponent. + + In single precision, 23 bits are used to represent the fraction part of + the mantissa (and therefore, '23' shows up in one of the computations below) and + 8 bits are used for the exponent. Since exponent field needs to reperesent both + positive and negative values, a bias (127 for single precision) is added to the actual + value. Therefore, to compute the actual exponent, 127 must be subtracted from the stored + value. + + As mentioned above, to find the corresponding fixed-point number, we multiply the + value with a scaling factor and then round it to the nearest integer. 
The scaling factor + is chosen to be a power of 2 and it's the largest value that can be safely multiplied + to the floating-point value, without causing the resulting value to overflow the range + of the integer type used to represent the fixed-point value. + + So, if we assume the scaling factor to be 2^x, the resulting fixed-point value will be: + round((-1)^S * (M) * 2^(E-Bias) * 2^x) + + This can be simplified to: + round((-1)^S * M * 2^(E-Bias+x)) + + Now, if 'int16' is used for fixed-point value, then it has to be >= -(2 * 2^14) + and <= (2 * 2^14) - 1. Since M (Mantissa) is always < 2, in order for the fixed-point value + to be within this range, 2^(E - Bias + x) must be <= 2^14 - 1. + And, if we ignore -1, (E - Bias + x) should be <= 14. Note: if mantissa gets too close to 2, + this will cause the resulting value to go out of range and require it to be saturated. + In the following implementation, we perform range check and adjust the scale to avoid + saturation. + For most cases, 2^x, where x = 14 - (E - Bias) or 14 - (E - 127) for single precision, is the + best scaling factor for 'int16' type that can be used to convert the floating-point value to + fixed-point with the least amount of precision loss. + + Additional notes on various floating-point values: + ------------------------------------------------ + 1) Denormalized values: causes assertion failure. The problem with the denormalized values + is that they require a very large scale factor (>= 2^127) to be converted to a fixed-point + value. As the denormalized values get smaller, the scale factor becomes too large to be + represented as an IEEE-754 floating point value (as being done in the computation below) + and therefore, the denormalized values aren't being handled here. 
+ 2) NaN and INF: assertion failure + """ + + def within_range(val, dtype): + if dtype == "int16": + return -32768 <= val <= 32767 + raise RuntimeError(f"Unsupported dtype, {dtype}'") + + # Make sure that 'flp' isn't NaN or infinity + if math.isnan(flp) or math.isinf(flp): + raise RuntimeError("NaN or INF can not be represented as fixed-point") + + flp_f = struct.pack("f", flp) + flp_i = struct.unpack("I", flp_f) + exp_stored_value = (flp_i[0] >> 23) & 0xFF + + if exp_stored_value == 0: + raise RuntimeError( + "Denormalized values are not considered for float -> fixed-point conversion!" + ) + + exp_value = ((flp_i[0] >> 23) & 0xFF) - 127 + if dtype == "int16": + max_bits = 14 + else: + raise RuntimeError(f"Unsupported dtype, {dtype}'") + + exp_scale_factor = max_bits - exp_value # log2 of the scale_factor + + if exp_scale_factor > 127: + raise RuntimeError("Value too small for fixed-point conversion!") + + # Scaling factor = 2^exp_scale_factor + # Since exp_scale_factor can be -ve or +ve, scaling factor is calculated by first + # representing the value in the binary format as per IEEE floating-point standand and then + # reinterpreting it as a float using struct.pack and struct.unpack functions. + # struct.pack returns a bytes object packed as integer and struct.unpack + # unpacks this bytes object into float. + scale = ((exp_scale_factor + 127) & 0xFF) << 23 + scale_i = struct.pack("I", scale) + scale_f = struct.unpack("f", scale_i) + fixed_point_value = int(round(flp * scale_f[0])) + + if not within_range(fixed_point_value, dtype): + # Adjust scale factor to avoid overflow. 
+ exp_scale_factor -= 1 + scale = ((exp_scale_factor + 127) & 0xFF) << 23 + scale_i = struct.pack("I", scale) + scale_f = struct.unpack("f", scale_i) + fixed_point_value = int(round(flp * scale_f[0])) + + return fixed_point_value, exp_scale_factor diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index ab5f62498262..70e50fcb68d6 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -267,8 +267,8 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): assert h == 1 and w == 1, "The size of h and w must be 1" return arr_np.reshape([n, 1, 1, c // 1024, 1024]) if new_layout == "nc-1024-2d": - N, C = arr_np.shape - return arr_np.reshape([N, C // 1024, 1024]) + n, c = arr_np.shape + return arr_np.reshape([n, c // 1024, 1024]) if new_layout == "nhwc-1024c-2d": N, H, W, C = arr_np.shape return arr_np.reshape([N, H, W, C // 1024, 1024]) @@ -278,11 +278,16 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): if new_layout == "nhwc-2048c-2d": N, H, W, C = arr_np.shape return arr_np.reshape([N, H, W, C // 2048, 2048]) - if new_layout in ["nhwc-8h8w32c-2d"]: + if new_layout == "nhwc-8h8w32c-2d": n, h, w, c = arr_np.shape return arr_np.reshape([n, h // 8, 8, w // 8, 8, c // 32, 32]).transpose( 0, 1, 3, 5, 2, 4, 6 ) + if new_layout == "n11c-2048c-2d": + n, h, w, c = arr_np.shape + assert h == 1 and w == 1, "The size of h and w must be 1" + return arr_np.reshape([n, h, w, c // 2048, 2048]) + raise RuntimeError(f"Unexpected new_layout '{new_layout}'") if current_layout == "nc": n, c = arr_np.shape @@ -300,3 +305,47 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): raise RuntimeError(f"Unexpected new_layout '{new_layout}'") raise RuntimeError(f"Unexpected current_layout '{current_layout}'") + + +def quantize_np(arr_np: numpy.ndarray, dtype: str): + """ + Returns quantized array along with scale 
and zero-point + + Parameters + ---------- + arr_np: numpy.ndarray + Input numpy array to be quantized + dtype: str + dtype of the quantized array: "uint8", "int8", etc + + Returns + ------- + quant_np: numpy.ndarray + Quantized numpy array + scale: float + Scale + zero_point: int + Value corresponding to float 0 + + """ + if dtype == "uint8": + qmax = 255 + qmin = 0 + elif dtype == "int8": + qmax = 128 + qmin = -127 + else: + raise RuntimeError(f"Unsupported quantized data type '{dtype}'") + fmin = numpy.amin(arr_np) + fmax = numpy.amax(arr_np) + + # Include floating-point zero in the range + if fmax < 0: + fmax = 0.0 + elif fmin > 0: + fmin = 0.0 + + scale = (fmax - fmin) / (qmax - qmin) + zero_point = numpy.rint((fmax * qmin - fmin * qmax) / (fmax - fmin)).astype("int32") + quant_np = (arr_np / scale + zero_point).astype(dtype) + return quant_np, scale, zero_point diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py new file mode 100644 index 000000000000..5ec46cf4ae70 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_fixed_point_conversion.py @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import math +import struct +import numpy as np +import tvm.topi.hexagon.utils as utils + +""" +Test float to fixed-point conversion. We do it by constructing a numpy array with the +wide range of floating-point values. These values are converted into the +fixed-point value using topi.hexagon.utils.get_fixed_point_value. Then, these values are +converted back into float using scale_factor provided by the function. These converted +floating point values are then compared against the original values and an assertion is +raised if they happened to be outside of the expected tolerance. +""" + + +class TestFixedPointConversion: + def test_fixed_point_conversion(self): + # Construct array with wide range of values + fp1 = np.random.uniform(0.00001, 0.0002, size=(10)) + fp2 = np.random.uniform(0.001, 0.02, size=(10)) + fp3 = np.random.uniform(1, 20, size=(10)) + fp4 = np.random.uniform(900, 1000, size=(10)) + fp5 = np.random.uniform(1e9, 1e10, size=(10)) + + # Test for values with largest possible exponent as per IEEE-754 floating-point + # standard (actual exp value = 127, stored exp value = 254). + fp6 = np.random.uniform(2.4e38, 2.5e38, size=(1)) + + # Test for very small floating-point values. + fp7 = np.random.uniform(1.4e-34, 1.7e-34, size=(1)) + + float_arr = np.concatenate((fp1, fp2, fp3, fp4, fp5, fp6, fp7)) + for flp in float_arr: + fxp, rsh = utils.get_fixed_point_value(flp, "int16") + # Compute scale_factor using rsh (rsh is log2 of the scale_factor). While doing this, + # we use IEEE-754 floating-point representation since rsh can be negative or positive. 
+ + scale = ((rsh + 127) & 0xFF) << 23 # Add bias (127) and position it into exponent bits + scale_i = struct.pack("I", scale) # Pack it as integer + scale_f = struct.unpack("f", scale_i) # Unpack as float + + converted_flp = fxp / scale_f[0] + assert math.isclose(flp, converted_flp, rel_tol=1e-2) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py index af60e0f2e084..743519901542 100644 --- a/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_avg_pool2d_slice.py @@ -25,35 +25,67 @@ from tvm.contrib.hexagon.build import HexagonLauncher from tvm.contrib.hexagon.session import Session import tvm.topi.hexagon.slice_ops as sl -from ..infrastructure import allocate_hexagon_array, transform_numpy +import tvm.topi.hexagon.qnn as qn +from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np from ..pytest_util import ( get_multitest_ids, create_populated_numpy_ndarray, - TensorContentConstant, TensorContentRandom, - TensorContentDtypeMin, - TensorContentDtypeMax, ) - input_layout = tvm.testing.parameter( "nhwc-8h2w32c2w-2d", ) +dtype = tvm.testing.parameter("float16", "uint8") + + +@tvm.testing.fixture +def output_layout(output_shape, dtype): + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" + return "nhwc-8h2w32c2w-2d" + elif dtype == "int8" or "uint8": + if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" + return "nhwc-8h8w32c-2d" + else: + raise RuntimeError(f"Unsupported data type '{dtype}'") + @tvm.testing.fixture def input_np(input_shape, dtype: str, input_tensor_populator): + if dtype == "uint8": + dtype = "float32" # Use "float32" input 
which will be quantized later return create_populated_numpy_ndarray(input_shape, dtype, input_tensor_populator) @tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, output_layout): - return transform_numpy(expected_output_np, "nhwc", output_layout) +def transformed_expected_output_np(expected_output_np, output_layout, dtype): + if dtype == "float16": + return transform_numpy(expected_output_np, "nhwc", output_layout) + elif dtype in ("uint8", "int8"): + quant_arr, scale, zero_point = quantize_np(expected_output_np, dtype) + return [transform_numpy(quant_arr, "nhwc", output_layout), scale, zero_point] + else: + raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_input_np_padded(input_np_padded, input_layout): - return transform_numpy(input_np_padded, "nhwc", input_layout) +def transformed_input_np_padded(input_np_padded, input_layout, dtype): + if dtype == "float16": + return transform_numpy(input_np_padded, "nhwc", input_layout) + elif dtype in ("uint8", "int8"): + quant_arr, scale, zero_point = quantize_np(input_np_padded, dtype) + return [transform_numpy(quant_arr, "nhwc", input_layout), scale, zero_point] + else: + raise RuntimeError(f"Unsupported data type '{dtype}'") class TestAvgPool2dSlice: @@ -65,8 +97,6 @@ class TestAvgPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad - "out_layout", # output_layout - None, # dtype None, # input_tensor_populator ] @@ -79,8 +109,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -91,8 +119,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -103,8 +129,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test non-one stride and dilation @@ -116,8 +140,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - 
"nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -128,8 +150,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -140,8 +160,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test non-zero padding @@ -153,8 +171,6 @@ class TestAvgPool2dSlice: [1, 1, 1, 1], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -165,8 +181,6 @@ class TestAvgPool2dSlice: [1, 2, 3, 4], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -177,8 +191,6 @@ class TestAvgPool2dSlice: [1, 2, 3, 4], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -189,8 +201,6 @@ class TestAvgPool2dSlice: [1, 2, 3, 4], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -202,8 +212,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -214,8 +222,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -226,8 +232,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -238,8 +242,6 @@ class TestAvgPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ] @@ -255,8 +257,6 @@ class TestAvgPool2dSlice: padding, ceil_mode, count_include_pad, - output_layout, - dtype, input_tensor_populator, ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) @@ -309,15 +309,32 @@ def input_shape(self, output_shape, kernel, padding, stride, dilation, output_la return [o_b, in_h, in_w, o_c] @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout): + def input_shape_padded(self, input_shape, padding, 
output_layout, dtype): # Input shape is adjusted to account for 'padding'. Also, due to the physical # layout of the buffer, height and width are adjusted so that they are a - # multiple of 8 and 4 respectively. - # NOTE: Input layout is always assumed to be nhwc-8h2w32c2w-2d. + # multiple of the buffer size dictated by the layout. + # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and + # for int8/uint8, it's nhwc-8h8w32c-2d. + # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple + # of 8. However, the width should be a multiple of 4 for the first case and 8 for + # the second case. + + height_mult = 8 + if dtype == "float16": + width_mult = 4 # input layout : nhwc-8h2w32c2w-2d + elif dtype in ("uint8", "int8"): + width_mult = 8 # input layout : nhwc-8h8w32c-2d + else: + raise RuntimeError(f"Unsupport dtype '{dtype}'") + pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - padded_input_height = ((input_shape[1] + pad_before_h + pad_after_h + 7) // 8) * 8 - padded_input_width = ((input_shape[2] + pad_before_w + pad_after_w + 3) // 4) * 4 + padded_input_height = ( + (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult + ) * height_mult + padded_input_width = ( + (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult + ) * width_mult return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] @tvm.testing.fixture @@ -332,80 +349,120 @@ def input_np_padded(self, input_np, input_shape, input_shape_padded, padding): ) return input_padded - @tvm.testing.requires_hexagon - def test_avg_pool2d_slice( + @tvm.testing.fixture + def schedule_args( self, stride, kernel, dtype, dilation, - padding, - count_include_pad, input_layout, output_layout, output_shape, - input_shape, input_shape_padded, - input_np, - input_np_padded, transformed_input_np_padded, transformed_expected_output_np, - expected_output_np, - 
hexagon_session: Session, ): - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928") - - target_hexagon = tvm.target.hexagon("v69") + """ + Construct schedule args based on dtype + """ A = te.placeholder(input_shape_padded, name="A", dtype=dtype) - M = sl.avg_pool2d_compute(A, output_shape, kernel, stride, dilation) + if dtype == "float16": + M = sl.avg_pool2d_compute(A, kernel, stride, dilation, output_shape) + tir_schedule = sl.avg_pool2d_schedule(M, A, output_layout, input_layout) + elif dtype in ("uint8", "int8"): + in_data, in_scale, in_zero_point = transformed_input_np_padded + _, out_scale, out_zero_point = transformed_expected_output_np + M = qn.qnn_avg_pool2d_compute( + A, + kernel, + stride, + dilation, + output_shape, + dtype, + in_zero_point, + in_scale, + out_zero_point, + out_scale, + ) + tir_schedule = qn.qnn_avg_pool2d_schedule(M, A, output_layout, input_layout) - # tir schedule - tir_schedule = sl.avg_pool2d_STIR_schedule(M, A, output_layout, input_layout) - sch = tir_schedule.mod + return [tir_schedule.mod, [A, M]] - input_axis_separator = [4] - if output_layout == "nhwc-8h2w32c2w-2d": - output_axis_separator = [4] - elif output_layout == "n11c-1024c-2d": - output_axis_separator = [4] - else: - raise RuntimeError(f"Unexpected layout '{output_layout}'") + @tvm.testing.requires_hexagon + def test_avg_pool2d_slice( + self, + dtype, + output_layout, + output_shape, + transformed_input_np_padded, + transformed_expected_output_np, + schedule_args, + hexagon_session: Session, + ): + target_hexagon = tvm.target.hexagon("v69") + in_data = transformed_input_np_padded with tvm.transform.PassContext(opt_level=3): func = tvm.build( - sch, - [A, M], + *schedule_args, tvm.target.Target(target_hexagon, host=target_hexagon), name="avg_pool2d", ) + input_axis_separator = [4] + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + "n11c-1024c-2d", + "n11c-2048c-2d", + ): + 
output_axis_separator = [4] + else: + raise RuntimeError(f"Unexpected layout '{output_layout}'") + + if dtype == "float16": + in_data_np = transformed_input_np_padded + out_data_np = transformed_expected_output_np + elif dtype in ("uint8", "int8"): + in_data_np, _, _ = transformed_input_np_padded + out_data_np, _, _ = transformed_expected_output_np + else: + raise RuntimeError(f"Unsupport dtype '{dtype}'") + input_arr = allocate_hexagon_array( hexagon_session.device, - data=transformed_input_np_padded, + data=in_data_np, axis_separators=input_axis_separator, mem_scope="global.vtcm", ) output_arr = allocate_hexagon_array( hexagon_session.device, - transformed_expected_output_np.shape, + out_data_np.shape, dtype, axis_separators=output_axis_separator, mem_scope="global.vtcm", ) mod = hexagon_session.load_module(func) + mod(input_arr, output_arr) b, h, w, c = output_shape if output_layout == "nhwc-8h2w32c2w-2d": output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) + elif output_layout == "nhwc-8h8w32c-2d": + output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) + elif output_layout == "n11c-2048c-2d": + output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) elif output_layout == "n11c-1024c-2d": output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) else: raise RuntimeError(f"Unexpected layout '{output_layout}'") - - np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3) + if dtype == "float16": + np.testing.assert_allclose(output_np, out_data_np, rtol=1e-3, atol=1e-3) + else: + np.testing.assert_allclose(output_np, out_data_np, rtol=1, atol=1) if __name__ == "__main__": From 17989e8ab519bdcc66014ccee42438f0dfd32023 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 24 Aug 2022 17:45:14 -0700 Subject: [PATCH 033/704] [microTVM] Fix `build` directory exists error (#12575) When you build a project from existing project directory using 
`tvm.micro.project.GeneratedProject.from_directory` it would show up error if build directory previously existed. --- apps/microtvm/zephyr/template_project/microtvm_api_server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index 38a7ec0c2939..76895c430bd6 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -673,6 +673,8 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec tf.extractall(project_dir) def build(self, options): + if BUILD_DIR.exists(): + shutil.rmtree(BUILD_DIR) BUILD_DIR.mkdir() zephyr_board = _find_board_from_cmake_file(API_SERVER_DIR / CMAKELIST_FILENAME) From b8fbfe26ae3b5e323d2d85ffe02913d78bd0fd20 Mon Sep 17 00:00:00 2001 From: Yuchao Zhang <16538059+Lucien0@users.noreply.github.com> Date: Thu, 25 Aug 2022 08:46:08 +0800 Subject: [PATCH 034/704] [MicroTVM] fix compile error when the compiler implements char as unsigned (#12519) When compiling tvm with micro on the compiler which implements char as unsigned(such as arm-linux-gcc), there is an error: `src/runtime/crt/graph_executor/load_json.c:218:12: error: result of comparison of constant -1 with expression of type 'char' is always false [-Werror,-Wtautological-constant-out-of-range-compare]` ` if (ch == EOF || ch == '\r' || ch == '\n') {` The reason is because the implementation of char is undefined, so it's better to specify here that it is signed. 
--- src/runtime/crt/graph_executor/load_json.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/crt/graph_executor/load_json.c b/src/runtime/crt/graph_executor/load_json.c index f1c1f6768168..3d3cdb8d1ce9 100644 --- a/src/runtime/crt/graph_executor/load_json.c +++ b/src/runtime/crt/graph_executor/load_json.c @@ -177,7 +177,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) { */ int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) { int status = 0; - char ch = reader->NextNonSpace(reader); + int ch = reader->NextNonSpace(reader); size_t output_counter = 0; while (output_counter < out_str_size || out_str == NULL) { ch = reader->NextChar(reader); From cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Wed, 24 Aug 2022 19:21:35 -0700 Subject: [PATCH 035/704] [TIR] Expose `shift_left` and `shift_right` to Python (#12584) This PR exposes the following TIR operation in python: - `shift_left`: tested [here](https://github.com/apache/tvm/blob/1afd0593956066635ee49297b731726c9218c91c/tests/python/unittest/test_tir_transform_simplify.py#L487) - `shift_right`: add new unittest Co-authored-by: yongwww --- python/tvm/tir/__init__.py | 2 +- python/tvm/tir/op.py | 38 ++++++++++++++++++++++ tests/python/unittest/test_tir_op_types.py | 16 +++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index f61e05cc92e9..94efe6e1abfe 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -63,7 +63,7 @@ from .op import likely, isnan, isnullptr, isfinite, isinf, copysign from .op import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, ceildiv from .op import comm_reducer, min, max, sum -from .op import q_multiply_shift +from .op import q_multiply_shift, shift_left, shift_right from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace from .schedule import StmtSRef, BlockScope, 
ScheduleState, Schedule, ScheduleError diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index c4618042b2dc..4f26b0f94765 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -1604,6 +1604,44 @@ def q_multiply_shift(x, y, q, s): return call_intrin("int32", "tir.q_multiply_shift", x, y, q, s) +def shift_left(x, y, span=None): + """Return the result of x left shifted by y bits. + + Parameters + ---------- + x : PrimExpr + Input argument. + + y : PrimExpr + Input argument. + + Returns + ------- + z : PrimExpr + The result. + """ + return _ffi_api.left_shift(x, y, span) + + +def shift_right(x, y, span=None): + """Return the result of x right shifted by y bits. + + Parameters + ---------- + x : PrimExpr + Input argument. + + y : PrimExpr + Input argument. + + Returns + ------- + z : PrimExpr + The result. + """ + return _ffi_api.right_shift(x, y, span) + + def fmod(x, y): """Return the remainder of x divided by y with the same sign as x. diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py index 3f0ec37adb85..835a397ee3b2 100644 --- a/tests/python/unittest/test_tir_op_types.py +++ b/tests/python/unittest/test_tir_op_types.py @@ -125,6 +125,20 @@ def test_tir_op_vectorcombine(): assert expr.op.name == "tir.vectorcombine" +def test_tir_op_shift_left(): + x = tir.Var("x", dtype="int32") + y = tir.Var("x", dtype="int32") + expr = tir.shift_left(x, y) + assert expr.op.name == "tir.shift_left" + + +def test_tir_op_shift_right(): + x = tir.Var("x", dtype="int32") + y = tir.Var("x", dtype="int32") + expr = tir.shift_right(x, y) + assert expr.op.name == "tir.shift_right" + + def test_tir_op_TVMBackendAllocWorkspace(): expr = tir.TVMBackendAllocWorkspace(0, 1, 2, 3, 4) assert expr.op.name == "tir.TVMBackendAllocWorkspace" @@ -154,5 +168,7 @@ def test_tir_op_TVMBackendFreeWorkspace(): test_tir_op_vectorlow() test_tir_op_vectorhigh() test_tir_op_vectorcombine() + test_tir_op_shift_left() + test_tir_op_shift_right() 
test_tir_op_TVMBackendAllocWorkspace() test_tir_op_TVMBackendFreeWorkspace() From 9aac161a46e5aca4c433ccb901c1bb84e6c8bd0c Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 24 Aug 2022 23:28:54 -0700 Subject: [PATCH 036/704] [MetaSchedule] Add software pipeline in CUDA tensor core auto tensorization (#12544) cc @Hzfengsy @junrushao @junrushao1994 @masahi @spectrometerHBH --- include/tvm/meta_schedule/schedule_rule.h | 3 +- python/tvm/meta_schedule/default_config.py | 1 + .../schedule_rule/multi_level_tiling.py | 4 + .../meta_schedule/testing/schedule_rule.py | 2 + .../multi_level_tiling_tensor_core.cc | 122 ++++++++++++++++- ...hedule_schedule_rule_multi_level_tiling.py | 125 ++++++++++++++++++ 6 files changed, 255 insertions(+), 2 deletions(-) diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index b5f4a17b698d..2da441c95e0b 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -190,13 +190,14 @@ class ScheduleRule : public runtime::ObjectRef { * NullOpt means disable vectorization * \param reuse_read Data reuse configuration for reading. NullOpt means no reuse. * \param reuse_write Data reuse configuration for writing. NullOpt means no reuse. + * \param use_software_pipeline Whether use the software pipeline. * \return The schedule rule created */ TVM_DLL static ScheduleRule MultiLevelTilingTensorCore( Array> intrin_groups, String structure, Optional> tile_binds, Optional max_innermost_factor, Optional> vector_load_lens, Optional> reuse_read, - Optional> reuse_write); + Optional> reuse_write, bool use_software_pipeline); /*! 
* \brief Create a rule: add-rfactor to some blocks if needed diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 105b3467de0e..0f1f7d3c2c6a 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -381,6 +381,7 @@ def schedule_rules(): levels=[2], scope="shared", ), + use_software_pipeline=False, ), *_DefaultCUDA.schedule_rules(), ] diff --git a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py index a728a91eb74e..6703bc5716e9 100644 --- a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py +++ b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py @@ -161,6 +161,8 @@ class MultiLevelTilingTensorCore(ScheduleRule): Data reuse configuration for reading. None means no reuse. reuse_write : Optional[ReuseType] Data reuse configuration for writing. None means no reuse. + use_software_pipeline : bool + Whether to use the software pipeline. 
""" def __init__( @@ -172,6 +174,7 @@ def __init__( vector_load_lens: Optional[List[int]] = None, reuse_read: Optional[ReuseType] = None, reuse_write: Optional[ReuseType] = None, + use_software_pipeline: bool = False, ) -> None: self.__init_handle_by_constructor__( _ffi_api.ScheduleRuleMultiLevelTilingTensorCore, # type: ignore # pylint: disable=no-member @@ -182,4 +185,5 @@ def __init__( vector_load_lens, reuse_read.as_dict() if reuse_read is not None else None, reuse_write.as_dict() if reuse_write is not None else None, + use_software_pipeline, ) diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py index 441ca930f858..46df4b95ce07 100644 --- a/python/tvm/meta_schedule/testing/schedule_rule.py +++ b/python/tvm/meta_schedule/testing/schedule_rule.py @@ -119,6 +119,7 @@ def multi_level_tiling_tensor_core( in_dtype: Union[str, List[str]] = "float16", out_dtype: Union[str, List[str]] = "float32", trans_b: Union[bool, List[bool]] = False, + use_software_pipeline: bool = False, ) -> ScheduleRule: """Default schedule rules for with multi-level tiling reuse for tensor core""" assert write_reuse_scope in ["shared", "global"] @@ -154,6 +155,7 @@ def multi_level_tiling_tensor_core( levels=[2], scope=write_reuse_scope, ), + use_software_pipeline=use_software_pipeline, ) raise NotImplementedError(f"{target.kind.name} is not supported") diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 7a3ec513db84..49704fb66b15 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -128,6 +128,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode { inline std::vector AddReadReuseTensorCore(TensorCoreState state) const; // Subrule: Add tensorized store inline std::vector AddWriteReuseTensorCore(TensorCoreState state) 
const; + // Subrule: Add software pipeline + inline std::vector AddSoftwarePipeline(TensorCoreState state) const; // Override ApplySubRules to apply tensorization-specific sub-rules std::vector ApplySubRules(std::vector states) final; @@ -155,6 +157,8 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode { public: /*! \brief The candidate tensor core intrin groups to apply */ std::vector intrin_groups; + /*! \brief Whether to use software pipeline */ + bool use_software_pipeline = false; static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingTensorCore"; TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingTensorCoreNode, MultiLevelTilingNode); @@ -222,6 +226,9 @@ std::vector MultiLevelTilingTensorCoreNode::ApplySubRules(std::vector(state)); }); + states = SubRule(std::move(states), [&](State state) { + return AddSoftwarePipeline(Downcast(state)); + }); return states; } @@ -286,6 +293,117 @@ std::vector MultiLevelTilingTensorCoreNode::AddReadReuseTensorCore( return {state}; } +std::vector MultiLevelTilingTensorCoreNode::AddSoftwarePipeline( + TensorCoreState state) const { + if (!use_software_pipeline) { + return {state}; + } + // The current config is not suitable for software pipelining. + if (r_indices_.size() < 2) { + return {state}; + } + + Schedule& sch = state->sch; + // Check reduction length after blockize. 
+ int64_t reduction_length = 1; + for (int r_index : r_indices_) { + const Array& tiles = state->tiles[r_index]; + for (const LoopRV& tile : tiles) { + const auto* extent = sch->Get(tile)->extent.as(); + ICHECK(extent != nullptr) << "Dynamic extent is not supported."; + reduction_length *= extent->value; + } + } + if (reduction_length <= 1) { + return {state}; + } + + // Add local stage and double buffering + for (int i = 0; i < 2; ++i) { + const tir::BlockRV cache_read = state->read_reuse.at(i); + sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true)); + sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0)); + } + + // Add annotations of software pipeline + // + // Before pipelining, the original loop can be expressed as the pseudo code below: + // + // for k0 in [0, K0): + // load tile k0 to registers + // load tile k0 from registers to shared memory + // + // for k1 in [0, K1): + // load fragment k1 of tile k0 + // compute matmul with fragment k1 + // + + // Inner software pipeline: Prefetch to tensor core fragment by one iteration + // The following annotation for the inner loop is equivalent to the pseudo code below: + // + // Pipelined inner loop: + // + // prologue: + // load fragment 0 + // body: + // for k1 in [0, K1 - 1): + // load fragment k1 + 1 + // compute matmul with fragment k1 + // epilogue: + // compute matmul with fragment K1 - 1 + // + sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_stage, + Array{0, 0, 1}); + sch->Annotate(state->tiles[r_indices_[1]].back(), tir::attr::software_pipeline_order, + Array{0, 1, 2}); + // Outer software pipeline: Interleave the outer loop with the (pipelined) inner loop. + // The prefetching stage of the inner pipeline is executed by one iteration in the outer loop. 
+ // The following annotation for the outer loop is equivalent to the pseudo code below: + // + // Pipelined outer loop with nested inner pipeline: + // + // prologue: + // load tile 0 to registers + // load tile 0 from registers to shared memory + // + // // prologue of the inner pipeline + // load fragment 0 of tile 0 + // + // body: + // for k0 in [0, K0 - 1): + // load tile k0 + 1 to registers + // + // // body of the inner pipeline + // for k1 in [0, K1 - 1): + // load fragment k1 + 1 of tile k0 + // compute matmul with fragment k1 of tile k0 + // + // load tile k0 + 1 from registers to shared memory + // + // // prologue of the inner pipeline + // load fragment 0 of tile k0 + 1 + // + // // epilogue of the inner pipeline + // compute matmul with fragment K1 - 1 of tile k0 + // + // epilogue: + // + // // body of the inner pipeline + // for k1 in [0, K1 - 1): + // load fragment k1 + 1 of tile K0 - 1 + // compute matmul with fragment k1 of tile K0 - 1 + // + // // epilogue of the inner pipeline + // compute matmul with fragment K1 - 1 of tile K0 - 1 + // + sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_stage, + Array{0, 0, 0, 0, 0, 1, 1}); + sch->Annotate(state->tiles[r_indices_[0]].back(), tir::attr::software_pipeline_order, + Array{0, 3, 1, 4, 5, 2, 6}); + + return {state}; +} + Optional MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin( TensorCoreStateNode* state, const String& intrin_name) const { BlockRV block_rv = state->block_rv; @@ -418,7 +536,8 @@ inline std::vector MultiLevelTilingTensorCoreNode::TransformForTensorizat ScheduleRule ScheduleRule::MultiLevelTilingTensorCore( Array> intrin_groups, String structure, Optional> tile_binds, Optional max_innermost_factor, Optional> vector_load_lens, - Optional> reuse_read, Optional> reuse_write) { + Optional> reuse_read, Optional> reuse_write, + bool use_software_pipeline) { auto node = MultiLevelTilingInitCommon( structure, tile_binds, max_innermost_factor, 
vector_load_lens, reuse_read, reuse_write); @@ -426,6 +545,7 @@ ScheduleRule ScheduleRule::MultiLevelTilingTensorCore( for (const auto& intrin_group_config : intrin_groups) { node->intrin_groups.emplace_back(TensorCoreIntrinGroup::FromConfig(intrin_group_config)); } + node->use_software_pipeline = use_software_pipeline; return ScheduleRule(node); } diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py index 4da870e455d3..87159fcb3110 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py @@ -709,6 +709,131 @@ def test_cuda_tensor_core_matmul_relu(): check_trace(spaces, expected) +def test_cuda_tensor_core_software_pipeline_matmul_relu(): + m = n = k = 128 + target = Target("cuda", host="llvm") + ctx = _create_context( + create_prim_func( + te_workload.matmul_relu( + n=n, + m=m, + k=k, + in_dtype="float16", + out_dtype="float32", + ) + ), + target=target, + rule=[ + multi_level_tiling_tensor_core( + target=target, write_reuse_scope="shared", use_software_pipeline=True + ), + auto_inline(target), + ], + ) + spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) + assert len(spaces) == 1 + + expected = [ + """b0 = sch.get_block(name="C", func_name="main") +b1 = sch.get_block(name="compute", func_name="main") +sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") +b2 = sch.reindex(block=b0, buffer=("write", 0)) +b3 = sch.reindex(block=b0, buffer=("read", 0)) +b4 = sch.reindex(block=b0, buffer=("read", 1)) +sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, )) +sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, )) +sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, )) +sch.transform_block_layout(block=b2, 
index_map=lambda i, j, k: (i, j, k, )) +sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, )) +sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, )) +sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, )) +l5, l6, l7 = sch.get_loops(block=b0) +l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True) +l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) +l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) +l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) +sch.reorder(l16, l18, l13, l11, l9) +b20 = sch.blockize(loop=l13) +sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32") +sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") +sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1) +l21, l22, l23 = sch.get_loops(block=b20) +v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4) +l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True) +v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4) +l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True) +v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4) +l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True) +sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43) +l50 = sch.fuse(l29, l39, preserve_unit_iters=True) +sch.bind(loop=l50, thread_axis="blockIdx.y") +l51 = sch.fuse(l30, l40, preserve_unit_iters=True) +sch.bind(loop=l51, thread_axis="blockIdx.x") +l52 = sch.fuse(l31, l41, preserve_unit_iters=True) +sch.bind(loop=l52, thread_axis="threadIdx.y") +b53 = sch.cache_write(block=b20, write_buffer_index=0, 
storage_scope="shared") +sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True) +b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator") +sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True) +v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) +sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55) +sch.reverse_compute_inline(block=b2) +l56, l57, l58, l59, l60 = sch.get_loops(block=b54) +l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True) +l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True) +l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54) +sch.reorder(l70, l64, l62) +b72 = sch.blockize(loop=l64) +sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") +b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared") +sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True) +l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73) +l80 = sch.fuse(l78, l79, preserve_unit_iters=True) +v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) +sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81) +b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared") +sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True) +l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82) +l89 = sch.fuse(l87, l88, preserve_unit_iters=True) +v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) +sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90) +b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a") +sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True) +l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91) +l99, l100 = 
sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True) +l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True) +l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91) +sch.reorder(l110, l102, l100) +b112 = sch.blockize(loop=l102) +sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") +b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b") +sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True) +l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113) +l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True) +l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True) +l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113) +sch.reorder(l132, l124, l122) +b134 = sch.blockize(loop=l124) +sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b") +sch.compute_inline(block=b3) +sch.compute_inline(block=b4) +sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8) +sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8) +sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1) +sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0) +sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1) +sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0) +sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1]) +sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) +sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1]) +sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 2, 6]) 
+sch.reverse_compute_inline(block=b1)""".split( + "\n" + ) + ] + check_trace(spaces, expected) + + def test_cuda_tensor_core_matmul_relu_global(): m = n = k = 128 target = Target("cuda", host="llvm") From b38738434b13e138916c994b326b5a128ed14004 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Thu, 25 Aug 2022 03:03:27 -0700 Subject: [PATCH 037/704] [TIR] Expose WMMA-related TensorCore builtins (#12589) This PR exposes the following TIR operation in python: `tvm_load_matrix_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L711) `tvm_store_matrix_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L913) `tvm_mma_sync`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L860) `tvm_bmma_sync`: add new unittest `tvm_fill_fragment`: tested [here](https://github.com/apache/tvm/blob/cd8fd9121deb22b078c9fe73cd8a554e6e7a0e15/tests/python/unittest/test_tvmscript_roundtrip.py#L571) Co-authored-by: yongwww cc: @junrushao cc @Hzfengsy @junrushao1994 Co-authored-by: yongwww --- python/tvm/tir/__init__.py | 7 + python/tvm/tir/op.py | 236 +++++++++++++++++++++ tests/python/unittest/test_tir_op_types.py | 43 ++++ 3 files changed, 286 insertions(+) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 94efe6e1abfe..04ab7f80daa9 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -52,6 +52,13 @@ from .op import tvm_tuple, tvm_struct_get, tvm_struct_set from .op import address_of, lookup_param, assume, undef from .op import tvm_thread_allreduce, type_annotation, tvm_access_ptr, tvm_throw_last_error +from .op import ( + tvm_load_matrix_sync, + tvm_store_matrix_sync, + tvm_mma_sync, + tvm_bmma_sync, + tvm_fill_fragment, +) from .op import vectorlow, vectorhigh, 
vectorcombine from .op import infinity, reinterpret from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index 4f26b0f94765..cf7985e8f489 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -595,6 +595,242 @@ def tvm_throw_last_error(): return call_intrin("handle", "tir.tvm_throw_last_error") +def tvm_load_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout): + """TVM intrinsic for tensor core load operators + + Parameters + ---------- + fragment : Var + The wmma fragment. + + m : UIntImm + The shape of wmma fragment. + + n : UIntImm + The shape of wmma fragment. + + k : UIntImm + The shape of wmma fragment. + + index : Expr + The fragment index. + + buffer_ptr : Expr + The fragment buffer pointer. + + stride : Expr + The fragment stride. + + layout : Literal["row_major", "column_major"] + The fragment layout. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + "handle", + "tir.tvm_load_matrix_sync", + fragment, + m, + n, + k, + index, + buffer_ptr, + stride, + layout, + ) + + +def tvm_mma_sync( + fragment_d, index_d, fragment_a, index_a, fragment_b, index_b, fragment_c, index_c +): + """TVM intrinsic for tensor core mma_sync operators + + Parameters + ---------- + fragment_d : Var + The wmma fragment_d. + + index_d : Expr + The fragment_d index. + + fragment_a : Var + The wmma fragment_a. + + index_a : Expr + The fragment_a index. + + fragment_b : Var + The wmma fragment_b. + + index_b : Expr + The fragment_b index. + + fragment_c : Var + The wmma fragment_c. + + index_c : Expr + The fragment_c index. + + Returns + ------- + call : PrimExpr + The call expression. 
+ """ + return call_intrin( + "handle", + "tir.tvm_mma_sync", + fragment_d, + index_d, + fragment_a, + index_a, + fragment_b, + index_b, + fragment_c, + index_c, + ) + + +def tvm_bmma_sync( + fragment_d, index_d, fragment_a, index_a, fragment_b, index_b, fragment_c, index_c +): + """TVM intrinsic for tensor core bmma_sync operators + + Parameters + ---------- + fragment_d : Var + The bwmma fragment_d. + + index_d : Expr + The fragment_d index. + + fragment_a : Var + The bwmma fragment_a. + + index_a : Expr + The fragment_a index. + + fragment_b : Var + The bwmma fragment_b. + + index_b : Expr + The fragment_b index. + + fragment_c : Var + The bwmma fragment_c. + + index_c : Expr + The fragment_c index. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + "handle", + "tir.tvm_bmma_sync", + fragment_d, + index_d, + fragment_a, + index_a, + fragment_b, + index_b, + fragment_c, + index_c, + ) + + +def tvm_fill_fragment(fragment, m, n, k, index, value): + """TVM intrinsic for tensor core fill_fragment operators + + Parameters + ---------- + fragment : Var + The wmma fragment + + m : UIntImm + The shape of wmma fragment. + + n : UIntImm + The shape of wmma fragment. + + k : UIntImm + The shape of wmma fragment. + + index : Expr + The fragment index. + + value : Expr + The value to be filled in fragment. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + "handle", + "tir.tvm_fill_fragment", + fragment, + m, + n, + k, + index, + value, + ) + + +def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout): + """TVM intrinsic for tensor core store operators + + Parameters + ---------- + fragment : Var + The wmma fragment. + + m : UIntImm + The shape of wmma fragment. + + n : UIntImm + The shape of wmma fragment. + + k : UIntImm + The shape of wmma fragment. + + index : Expr + The fragment index. + + buffer_ptr : Expr + The fragment buffer pointer. 
+ + stride : Expr + The fragment stride. + + layout : Literal["row_major", "column_major"] + The fragment layout. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + "handle", + "tir.tvm_store_matrix_sync", + fragment, + m, + n, + k, + index, + buffer_ptr, + stride, + layout, + ) + + def vectorlow(dtype, vec): """Get the low level half of the vector diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py index 835a397ee3b2..5254e7326e24 100644 --- a/tests/python/unittest/test_tir_op_types.py +++ b/tests/python/unittest/test_tir_op_types.py @@ -104,6 +104,44 @@ def test_tir_op_tvm_throw_last_error(): assert expr.op.name == "tir.tvm_throw_last_error" +def test_tir_op_tvm_load_matrix_sync(): + buffer = tir.decl_buffer((16, 16), "float32") + x = tir.Var("x", "handle") + expr = tir.tvm_load_matrix_sync(buffer.data, 16, 16, 16, 0, x, 128, "row_major") + assert expr.op.name == "tir.tvm_load_matrix_sync" + + +def test_tir_op_tvm_store_matrix_sync(): + buffer = tir.decl_buffer((16, 16), "float32") + x = tir.Var("x", "handle") + expr = tir.tvm_store_matrix_sync(buffer.data, 16, 16, 16, 0, x, 128, "row_major") + assert expr.op.name == "tir.tvm_store_matrix_sync" + + +def test_tir_op_tvm_mma_sync(): + buffer_0 = tir.decl_buffer((16, 16), "float32") + buffer_1 = tir.decl_buffer((16, 16), "float32") + buffer_2 = tir.decl_buffer((16, 16), "float32") + buffer_3 = tir.decl_buffer((16, 16), "float32") + expr = tir.tvm_mma_sync(buffer_0.data, 0, buffer_1.data, 0, buffer_2.data, 0, buffer_3.data, 0) + assert expr.op.name == "tir.tvm_mma_sync" + + +def test_tir_op_tvm_bmma_sync(): + buffer_0 = tir.decl_buffer((16, 16), "float32") + buffer_1 = tir.decl_buffer((16, 16), "float32") + buffer_2 = tir.decl_buffer((16, 16), "float32") + buffer_3 = tir.decl_buffer((16, 16), "float32") + expr = tir.tvm_bmma_sync(buffer_0.data, 0, buffer_1.data, 0, buffer_2.data, 0, buffer_3.data, 0) + assert expr.op.name == 
"tir.tvm_bmma_sync" + + +def test_tir_op_tvm_fill_fragment(): + buffer = tir.decl_buffer((16, 16), "float32") + expr = tir.tvm_fill_fragment(buffer.data, 16, 16, 16, 0, 0) + assert expr.op.name == "tir.tvm_fill_fragment" + + def test_tir_op_vectorlow(): buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1) vec = buffer.vload([0, 0], dtype="int8x16") @@ -165,6 +203,11 @@ def test_tir_op_TVMBackendFreeWorkspace(): test_tir_op_type_annotation() test_tir_op_tvm_access_ptr() test_tir_op_tvm_throw_last_error() + test_tir_op_tvm_load_matrix_sync(), + test_tir_op_tvm_store_matrix_sync(), + test_tir_op_tvm_mma_sync(), + test_tir_op_tvm_bmma_sync(), + test_tir_op_tvm_fill_fragment(), test_tir_op_vectorlow() test_tir_op_vectorhigh() test_tir_op_vectorcombine() From 40bdea8d7ae1109e33ac64265b4819bb8ebef8b3 Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Thu, 25 Aug 2022 00:04:07 -1000 Subject: [PATCH 038/704] [PyTorch] Add aten::new_empty (#12591) This PR intends to add `aten::new_empty` which is used for model like `hf_Longformer`. 
cc: @masahi --- python/tvm/relay/frontend/pytorch.py | 16 ++++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 04a25c86b799..9f808203a6e1 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -2506,6 +2506,21 @@ def empty_like(self, inputs, input_types): dtype = input_types[0] return _op.zeros(shape, dtype) + def new_empty(self, inputs, input_types): + size = inputs[1] + + import torch + + if not isinstance(size, (_expr.Expr, list, tuple, torch.Size, np.ndarray)): + msg = "Data type %s could not be parsed in empty op" % (type(size)) + raise AssertionError(msg) + + if inputs[2] is not None: + dtype = _convert_dtype_value(inputs[2]) + else: + dtype = input_types[0] + return _op.zeros(size, dtype) + def randn(self, inputs, input_types): import time # use current time as seed @@ -3639,6 +3654,7 @@ def create_convert_map(self): "aten::numel": self.numel, "aten::empty": self.empty, "aten::empty_like": self.empty_like, + "aten::new_empty": self.new_empty, "aten::randn": self.randn, "aten::bincount": self.bincount, "aten::scatter_add": self.scatter_add, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 7e00770cd593..2d0a476e372d 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -4162,6 +4162,23 @@ def test_func(data): verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()], assert_shape_only=True) +@tvm.testing.uses_gpu +def test_new_empty(): + """test_forward_new_empty""" + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + def test_func(input_tensor): + return input_tensor.new_empty([3, 10, 10]) + + verify_model_with_input(test_func, [torch.rand(input_shape).float()], assert_shape_only=True) + + def 
test_func1(input_tensor): + return input_tensor.new_empty([3, 10, 10], dtype=torch.int32) + + verify_model_with_input(test_func1, [torch.rand(input_shape).float()], assert_shape_only=True) + + def test_randn(): """Test for aten::randn""" From fb7cf97fbc2cc19a7eea879a3a1598780f6aa6aa Mon Sep 17 00:00:00 2001 From: masahi Date: Thu, 25 Aug 2022 20:05:45 +0900 Subject: [PATCH 039/704] [CI] Install xgboost in Hexagon image (#12592) Needed for https://github.com/apache/tvm/pull/12587 @mehrdadh cc @Mousius @areusch @driazati @gigiblender --- docker/Dockerfile.ci_hexagon | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon index cf7407c2ab05..66b78ae0800c 100644 --- a/docker/Dockerfile.ci_hexagon +++ b/docker/Dockerfile.ci_hexagon @@ -83,3 +83,7 @@ RUN bash /install/ubuntu_install_tflite.sh # Install ONNX COPY install/ubuntu_install_onnx.sh /install/ubuntu_install_onnx.sh RUN bash /install/ubuntu_install_onnx.sh + +# xgboost (for tuning) +COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh +RUN bash /install/ubuntu_install_redis.sh From cc19cdd711b620582baacff82318d3adf5b15115 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Thu, 25 Aug 2022 07:22:37 -0700 Subject: [PATCH 040/704] [microTVM][Zephyr] Add recommended heap size for NRF and qemu_x86 (#12585) This PR sets recommended heap size for qemu_x86 and NRF board to fix memory size with models like VWW using AoT host driven executor. 
--- apps/microtvm/zephyr/template_project/boards.json | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/apps/microtvm/zephyr/template_project/boards.json b/apps/microtvm/zephyr/template_project/boards.json index dcca9c800224..28cbee54d602 100644 --- a/apps/microtvm/zephyr/template_project/boards.json +++ b/apps/microtvm/zephyr/template_project/boards.json @@ -38,7 +38,8 @@ "is_qemu": false, "fpu": true, "vid_hex": "1366", - "pid_hex": "1055" + "pid_hex": "1055", + "recommended_heap_size_bytes": 368640 }, "nucleo_f746zg": { "board": "nucleo_f746zg", @@ -55,7 +56,7 @@ "fpu": true, "vid_hex": "0483", "pid_hex": "374b", - "recommended_heap_size_bytes": 512000 + "recommended_heap_size_bytes": 524288 }, "qemu_cortex_r5": { "board": "qemu_cortex_r5", @@ -87,7 +88,8 @@ "is_qemu": true, "fpu": true, "vid_hex": "", - "pid_hex": "" + "pid_hex": "", + "recommended_heap_size_bytes": 524288 }, "stm32f746g_disco": { "board": "stm32f746g_disco", From 56b7c8ae9676ad2184443b60e0c795672e2b6fc9 Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Thu, 25 Aug 2022 16:43:06 +0100 Subject: [PATCH 041/704] [CI] Assert some unittests are not skipped in CI (#12436) This PR adds a script that does a diff of skipped tests between the latest successful build on the main and the current branch. Then, it posts a comment with the report on the open PR. 
#11670 --- .github/workflows/tests_bot.yml | 21 ++ tests/python/ci/test_ci.py | 179 ++++++++++++ tests/scripts/github_skipped_tests_comment.py | 256 ++++++++++++++++++ 3 files changed, 456 insertions(+) create mode 100644 .github/workflows/tests_bot.yml create mode 100755 tests/scripts/github_skipped_tests_comment.py diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml new file mode 100644 index 000000000000..e9d7d81375e4 --- /dev/null +++ b/.github/workflows/tests_bot.yml @@ -0,0 +1,21 @@ + +name: tests-bot +on: + status +jobs: + run-tests-bot: + if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Comment skipped tests + env: + AWS_ACCESS_KEY_ID: ${{ secrets.CI_RESOURCES_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.CI_RESOURCES_AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: us-west-2 + COMMIT_SHA: ${{ github.event.sha }} + TARGET_URL: ${{ github.event.target_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -eux + python tests/scripts/github_skipped_tests_comment.py \ No newline at end of file diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 1e2008fdd7ba..c45a0d8d8ee0 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
"""Test various CI scripts and GitHub Actions workflows""" +import shutil import subprocess import json import textwrap @@ -33,6 +34,184 @@ def parameterize_named(*values): return pytest.mark.parametrize(",".join(keys), [tuple(d.values()) for d in values]) +# pylint: disable=line-too-long +TEST_DATA_SKIPPED_BOT = { + "found-diff": { + "main_xml_file": "unittest/file1.xml", + "main_xml_content": """ + + + + + + + """, + "pr_xml_file": "unittest/file2.xml", + "pr_xml_content": """ + + + + + Skipped + + + + + Skipped + + + + + """, + "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", + "s3_prefix": "tvm-jenkins-artifacts-prod", + "jenkins_prefix": "ci.tlcpack.ai", + "common_main_build": """{"build_number": "4115", "state": "success"}""", + "commit_sha": "SHA", + "expected_url": "issues/11594/comments", + "expected_body": """\n\nThe list below shows some tests that ran in main SHA but were skipped in the CI build of SHA:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).""", + }, + "no-diff": { + "main_xml_file": "unittest/file1.xml", + "main_xml_content": """ + + + + + Skipped + + + + + """, + "pr_xml_file": "unittest/file2.xml", + "pr_xml_content": """ + + + + + Skipped + + + + + """, + "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", + "s3_prefix": "tvm-jenkins-artifacts-prod", + "jenkins_prefix": "ci.tlcpack.ai", + "common_main_build": """{"build_number": "4115", "state": "success"}""", + "commit_sha": "SHA", + "expected_url": "issues/11594/comments", + "expected_body": """\n\nNo additional skipped tests found in this branch for commit SHA.""", + }, + "unable-to-run": { + "main_xml_file": "unittest/file1.xml", + "main_xml_content": """ + + + 
""", + "pr_xml_file": "unittest/file2.xml", + "pr_xml_content": """ + + + """, + "target_url": "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", + "s3_prefix": "tvm-jenkins-artifacts-prod", + "jenkins_prefix": "ci.tlcpack.ai", + "common_main_build": """{"build_number": "4115", "state": "failed"}""", + "commit_sha": "SHA", + "expected_url": "issues/11594/comments", + "expected_body": """\n\nUnable to run tests bot because main failed to pass CI at SHA.""", + }, +} +# pylint: enable=line-too-long + + +@tvm.testing.skip_if_wheel_test +@pytest.mark.parametrize( + [ + "main_xml_file", + "main_xml_content", + "pr_xml_file", + "pr_xml_content", + "target_url", + "s3_prefix", + "jenkins_prefix", + "common_main_build", + "commit_sha", + "expected_url", + "expected_body", + ], + [tuple(d.values()) for d in TEST_DATA_SKIPPED_BOT.values()], + ids=TEST_DATA_SKIPPED_BOT.keys(), +) +# pylint: enable=line-too-long +def test_skipped_tests_comment( + tmpdir_factory, + main_xml_file, + main_xml_content, + pr_xml_file, + pr_xml_content, + target_url, + s3_prefix, + jenkins_prefix, + common_main_build, + commit_sha, + expected_url, + expected_body, +): + """ + Test that a comment with a link to the docs is successfully left on PRs + """ + skipped_tests_script = REPO_ROOT / "tests" / "scripts" / "github_skipped_tests_comment.py" + + def write_xml_file(root_dir, xml_file, xml_content): + shutil.rmtree(root_dir, ignore_errors=True) + file = root_dir / xml_file + file.parent.mkdir(parents=True) + with open(file, "w") as f: + f.write(textwrap.dedent(xml_content)) + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + git.run("init") + git.run("checkout", "-b", "main") + git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") + + pr_test_report_dir = Path(git.cwd) / "pr-reports" + write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content) + main_test_report_dir = Path(git.cwd) / "main-reports" + write_xml_file(main_test_report_dir, main_xml_file, 
main_xml_content) + + proc = subprocess.run( + [ + str(skipped_tests_script), + "--dry-run", + f"--s3-prefix={s3_prefix}", + f"--jenkins-prefix={jenkins_prefix}", + f"--common-main-build={common_main_build}", + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha}, + encoding="utf-8", + cwd=git.cwd, + check=False, + ) + if proc.returncode != 0: + raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") + + assert f"Dry run, would have posted {expected_url} with data {expected_body}." in proc.stderr + + @tvm.testing.skip_if_wheel_test @pytest.mark.parametrize( "target_url,base_url,commit_sha,expected_url,expected_body", diff --git a/tests/scripts/github_skipped_tests_comment.py b/tests/scripts/github_skipped_tests_comment.py new file mode 100755 index 000000000000..ef0630620b97 --- /dev/null +++ b/tests/scripts/github_skipped_tests_comment.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import json +import os +import logging +import argparse +import subprocess +import sys +from urllib import error +from xml.etree import ElementTree + +import requests + +from git_utils import git, GitHubRepo, parse_remote +from cmd_utils import init_log + +SKIPPED_TESTS_COMMENT_MARKER = "\n\n" +GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]" + +PR_TEST_REPORT_DIR = "pr-reports" +MAIN_TEST_REPORT_DIR = "main-reports" + + +def run_subprocess(command): + logging.info(f"Running command {command}") + proc = subprocess.run(command, shell=True, stdout=subprocess.PIPE, encoding="utf-8") + if proc.returncode != 0: + raise RuntimeError(f"Command failed {command}:\nstdout:\n{proc.stdout}") + return proc + + +def retrieve_test_report(s3_url, target_dir): + command = f"aws s3 cp {s3_url} {target_dir} --recursive" + run_subprocess(command) + + +def get_common_commit_sha(): + command = "git merge-base origin/main HEAD" + proc = run_subprocess(command) + return proc.stdout.strip() + + +def get_main_jenkins_build_number(github, common_commit): + json = github.get(f"commits/{common_commit}/status") + for status in reversed(json["statuses"]): + if status["context"] != "tvm-ci/branch": + continue + state = status["state"] + target_url = str(status["target_url"]) + build_number = ( + target_url[target_url.find("job/main") : len(target_url)] + .strip("job/main/") + .strip("/display/redirect") + ) + assert build_number.isdigit() + return {"build_number": build_number, "state": state} + raise RuntimeError(f"Failed to find main build number for commit {common_commit}") + + +def retrieve_test_reports(common_main_build, pr_number, build_number, s3_prefix): + cur_build_s3_link = ( + f"s3://{s3_prefix}/tvm/PR-{str(pr_number)}/{str(build_number)}/pytest-results" + ) + retrieve_test_report(cur_build_s3_link, PR_TEST_REPORT_DIR) + + common_build_s3_link = f"s3://{s3_prefix}/tvm/main/{common_main_build}/pytest-results" + retrieve_test_report(common_build_s3_link, MAIN_TEST_REPORT_DIR) + + +def 
get_pr_and_build_numbers(target_url): + target_url = target_url[target_url.find("PR-") : len(target_url)] + split = target_url.split("/") + pr_number = split[0].strip("PR-") + build_number = split[1] + return {"pr_number": pr_number, "build_number": build_number} + + +def build_test_set(directory): + subdir_to_skipped = {} + subdirs = [ + item for item in os.listdir(directory) if os.path.isdir(os.path.join(directory, item)) + ] + for subdir in subdirs: + subdir_to_skipped[subdir] = set() + for root, _, files in os.walk(directory + "/" + subdir): + for file in files: + test_report = ElementTree.parse(root + "/" + file) + for testcase in test_report.iter("testcase"): + skipped = testcase.find("skipped") + if skipped is not None: + key = testcase.attrib["classname"] + "#" + testcase.attrib["name"] + subdir_to_skipped[subdir].add(key) + return subdir_to_skipped + + +def to_node_name(dir_name: str): + return dir_name.replace("_", ": ", 1) + + +def build_comment( + common_commit_sha, + common_main_build, + skipped_list, + pr_number, + build_number, + commit_sha, + jenkins_prefix, +): + if common_main_build["state"] != "success": + return f"{SKIPPED_TESTS_COMMENT_MARKER}Unable to run tests bot because main failed to pass CI at {common_commit_sha}." + + if len(skipped_list) == 0: + return f"{SKIPPED_TESTS_COMMENT_MARKER}No additional skipped tests found in this branch for commit {commit_sha}." + + text = ( + f"{SKIPPED_TESTS_COMMENT_MARKER}The list below shows some tests that ran in main {common_commit_sha} but were " + f"skipped in the CI build of {commit_sha}:\n" + f"```\n" + ) + for skip in skipped_list: + text += skip + "\n" + text += ( + f"```\nA detailed report of ran tests is [here](https://{jenkins_prefix}/job/tvm/job/PR-{str(pr_number)}" + f"/{str(build_number)}/testReport/)." 
+ ) + return text + + +def get_pr_comments(github, url): + try: + return github.get(url) + except error.HTTPError as e: + logging.exception(f"Failed to retrieve PR comments: {url}: {e}") + return [] + + +def search_for_docs_comment(comments): + for comment in comments: + if ( + comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN + and SKIPPED_TESTS_COMMENT_MARKER in comment["body"] + ): + return comment + return None + + +if __name__ == "__main__": + help = ( + "Compares the skipped tests of this PR against the last successful build on main. Also comments on the PR " + "issue when tests are skipped in this PR and not on main." + ) + parser = argparse.ArgumentParser(description=help) + parser.add_argument("--remote", default="origin", help="ssh remote to parse") + parser.add_argument("--s3-prefix", default="tvm-jenkins-artifacts-prod") + parser.add_argument("--jenkins-prefix", default="ci.tlcpack.ai") + parser.add_argument("--common-main-build") + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="run but don't send any request to GitHub", + ) + args = parser.parse_args() + init_log() + + remote = git(["config", "--get", f"remote.{args.remote}.url"]) + user, repo = parse_remote(remote) + + target_url = os.environ["TARGET_URL"] + pr_and_build = get_pr_and_build_numbers(target_url) + + commit_sha = os.environ["COMMIT_SHA"] + + if not args.dry_run: + github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) + common_commit_sha = get_common_commit_sha() + common_main_build = get_main_jenkins_build_number(github, common_commit_sha) + retrieve_test_reports( + common_main_build=common_main_build["build_number"], + pr_number=pr_and_build["pr_number"], + build_number=pr_and_build["build_number"], + s3_prefix=args.s3_prefix, + ) + else: + assert args.common_main_build is not None + common_main_build = json.loads(args.common_main_build) + common_commit_sha = os.environ["COMMIT_SHA"] + + main_tests = 
build_test_set(MAIN_TEST_REPORT_DIR) + build_tests = build_test_set(PR_TEST_REPORT_DIR) + + skipped_list = [] + for subdir, skipped_set in build_tests.items(): + skipped_main = main_tests.get(subdir) # None when main has no report dir of this name + if skipped_main is None: + logging.warning(f"Could not find directory {subdir} in main.") + continue + + diff_set = skipped_set - skipped_main + if len(diff_set) != 0: + for test in diff_set: + skipped_list.append(f"{to_node_name(subdir)} -> {test}") + + # Sort the list to maintain an order in the output. Helps when validating the output in tests. + skipped_list.sort() + + if len(skipped_list) == 0: + logging.info("No skipped tests found.") + + body = build_comment( + common_commit_sha, + common_main_build, + skipped_list, + pr_and_build["pr_number"], + pr_and_build["build_number"], + commit_sha, + args.jenkins_prefix, + ) + url = f'issues/{pr_and_build["pr_number"]}/comments' + if not args.dry_run: + # For now, only comment for PRs open by driazati, gigiblender and areusch. + get_pr_url = f'pulls/{pr_and_build["pr_number"]}' + pull_request_body = github.get(get_pr_url) + author = pull_request_body["user"]["login"] + if author not in ["driazati", "gigiblender", "areusch"]: + logging.info(f"Skipping this action for user {author}") + sys.exit(0) + + pr_comments = get_pr_comments(github, url) + comment = search_for_docs_comment(pr_comments) + + if comment is not None: + comment_url = comment["url"] + comment_id = comment_url[comment_url.find("comments/") : len(comment_url)].strip( + "comments/" + ) + github.patch(f"issues/comments/{comment_id}", {"body": body}) + else: + github.post(url, {"body": body}) + else: + logging.info(f"Dry run, would have posted {url} with data {body}.") From 61c034ae27712d5cab4720b3f259df68cf004ac2 Mon Sep 17 00:00:00 2001 From: Huan Mei <352648791@qq.com> Date: Thu, 25 Aug 2022 23:44:50 +0800 Subject: [PATCH 042/704] [DOC] fix code-block error in debuggging TVM part (#12597) The code block in part Debuggging TVM is not showing up. Just fix it.
--- docs/dev/how_to/debugging_tvm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst index 6060f797b3e4..0ad44fdd17ce 100644 --- a/docs/dev/how_to/debugging_tvm.rst +++ b/docs/dev/how_to/debugging_tvm.rst @@ -60,7 +60,7 @@ optimization). To enable VLOGging, do the following: Examples: -.. code-block: shell +.. code-block:: shell # enable VLOG(0), VLOG(1), VLOG(2) in all files. $ TVM_LOG_DEBUG=DEFAULT=2 python3 -c 'import tvm' From b547106fdeb634d2fc692d8a516899c4abe6edbc Mon Sep 17 00:00:00 2001 From: Lite Ye Date: Thu, 25 Aug 2022 11:45:43 -0400 Subject: [PATCH 043/704] [CI] github_cc_reviewers: Catch all exceptions so all reviewers can be processed (#12578) In a recent change, `github.post` throws `RuntimeError` instead of `HTTPError` when the requested reviewer isn't a project collaborator. This prevents other reviewers to be added to the PR, for example, https://github.com/apache/tvm/runs/8001367110?check_suite_focus=true. This PR changes the caller to catch any exception so the execution won't be interrupted. Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> --- tests/scripts/github_cc_reviewers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/scripts/github_cc_reviewers.py b/tests/scripts/github_cc_reviewers.py index bfc0077b6691..d8323221a7b0 100755 --- a/tests/scripts/github_cc_reviewers.py +++ b/tests/scripts/github_cc_reviewers.py @@ -16,6 +16,7 @@ # specific language governing permissions and limitations # under the License. 
+import sys import os import json import argparse @@ -106,5 +107,8 @@ def find_reviewers(body: str) -> List[str]: for reviewer in to_add: try: github.post(f"pulls/{number}/requested_reviewers", {"reviewers": [reviewer]}) - except error.HTTPError as e: + except KeyboardInterrupt: + sys.exit() + except (RuntimeError, error.HTTPError) as e: + # Catch any exception so other reviewers can be processed print(f"Failed to add reviewer {reviewer}: {e}") From 399f2e9b7006c95a2ebf0b3d35cdbacb340dd68d Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 25 Aug 2022 16:48:40 +0100 Subject: [PATCH 044/704] [microNPU] Remove xfail from tests relating to #12511 (#12570) Removes tests previously marked as xfail since the issue has now been resolved. --- tests/python/contrib/test_ethosu/test_codegen.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py index ae7d0821bb7f..28ea48f00932 100644 --- a/tests/python/contrib/test_ethosu/test_codegen.py +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -347,7 +347,6 @@ def binary_elementwise(lhs, rhs): ([1, 4, 4], [4, 1]), ], ) -@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_binary_add_with_non_4d_shapes( request, accel_type, @@ -606,7 +605,6 @@ def rounding_right_shift(lhs, rhs): @pytest.mark.parametrize("accel_type", ACCEL_TYPES) @pytest.mark.parametrize("ifm_shape", [(3, 2), (1, 15, 11, 7), (3, 1, 12), (400,)]) @pytest.mark.parametrize("ifm_scale, ifm_zp, ofm_scale, ofm_zp", [(1, 0, 1, 0), (0.015, 3, 0.2, 5)]) -@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_ethosu_identity_codegen( request, ifm_shape, ifm_scale, ifm_zp, ofm_scale, ofm_zp, accel_type ): @@ -655,7 +653,6 @@ def generate_output_data(input_data): ((8, 7, 3), (-4, 1, 8, -2)), ], ) -@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def 
test_relay_reshape_codegen(ifm_shape, new_shape, accel_type): np.random.seed(0) @@ -688,7 +685,6 @@ def create_model(): ([5000], [123], [2151]), ], ) -@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_tflite_slice(request, accel_type, ifm_shape, begin, size): np.random.seed(0) @@ -724,7 +720,6 @@ def strided_slice_func(x): "ifm_shape", [[1, 5, 12, 4], [1, 1, 2], [4, 3, 2], [10, 20], [345]], ) -@pytest.mark.xfail(reason="See https://github.com/apache/tvm/issues/12511") def test_ethosu_unary_elementwise( request, accel_type, From f7c143608f9bb45dce8e3f93c3a89275a7c104f6 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Thu, 25 Aug 2022 17:17:22 +0100 Subject: [PATCH 045/704] [ETHOSN] Support conversion of add to depthwise (#12531) In similar fashion to the conversion of mul to depthwise, this commit converts add when one input is a constant of shape [1, ..., n] to a depthwise convolution. If neither input is a constant, the add is offloaded naturally like before. The addition testing has been improved to use pytest features. 
--- python/tvm/relay/op/contrib/ethosn.py | 43 +++- src/relay/backend/contrib/ethosn/codegen.cc | 8 +- .../contrib/ethosn/convert_equivalent.cc | 109 ++++++++- .../contrib/test_ethosn/infrastructure.py | 3 +- .../contrib/test_ethosn/test_addition.py | 214 +++++++++++++----- .../test_ethosn/test_convert_equivalents.py | 99 +++++--- .../contrib/test_ethosn/test_networks.py | 2 +- 7 files changed, 377 insertions(+), 101 deletions(-) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 73dd6b735775..83972bd08b41 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -215,6 +215,24 @@ def qnn_mul_pattern(): input_is_right = gen_mul_inputs(is_constant(), wildcard()) return input_is_left | input_is_right + def qnn_add_pattern(): + add_op = is_op("qnn.add") + gen_add_inputs = lambda x, y: add_op( + x, + y, + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + is_constant(), + ) + two_inputs = gen_add_inputs(wildcard(), wildcard()) + input_is_left = gen_add_inputs(wildcard(), is_constant()) + input_is_right = gen_add_inputs(is_constant(), wildcard()) + + return input_is_left | input_is_right | two_inputs + def check_conv2d(extract): """Check if a conv2d is supported by Ethos-N.""" if not ethosn_available(): @@ -289,8 +307,24 @@ def check_resize(extract): return _ethosn.resize(extract) + def check_add(extract): + """Check if an addition is supported by Ethos-N.""" + if not ethosn_available(): + return False + # Do not support scalar constants for now + check_scalar = lambda i: isinstance(i, tvm.relay.Constant) and len(i.data.shape) == 0 + if check_scalar(extract.args[0]) or check_scalar(extract.args[1]): + return False + + inputs = extract.args[0:2] + if any([isinstance(i, tvm.relay.Constant) for i in inputs]): + extract = _ethosn.ConvertQnnAdd(extract) + return _ethosn.conv2d(extract) + return _ethosn.addition(extract) + return [ ("ethos-n.qnn_mul", 
qnn_mul_pattern(), check_mul), + ("ethos-n.qnn_add", qnn_add_pattern(), check_add), ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d), ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d), ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid), @@ -332,15 +366,6 @@ def reshape(expr): return _ethosn.reshape(expr) -@tvm.ir.register_op_attr("qnn.add", "target.ethos-n") -def qnn_add(expr): - """Check if an addition is supported by Ethos-N.""" - if not ethosn_available(): - return False - - return _ethosn.addition(expr) - - @tvm.ir.register_op_attr("qnn.concatenate", "target.ethos-n") def qnn_concatenate(expr): """Check if a concatenate is supported by Ethos-N.""" diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index bc4613b80155..69672a143585 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -104,9 +104,9 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) { params.input_info = GetTensorInfo(tensor_table_, call); err += EthosnAPI::Reshape(call, ¶ms); tensor_table_[cn->args[0]] = {params.input_info}; - } else if (IsEthosnOp(call, "qnn.add")) { + } else if (IsEthosnFunc(call, "ethos-n.qnn_add")) { AdditionParams params; - err += EthosnAPI::Addition(call, ¶ms); + err += EthosnAPI::Addition(cn->op.as()->body, ¶ms); tensor_table_[cn->args[0]] = {params.lhs_info}; tensor_table_[cn->args[1]] = {params.rhs_info}; } else if (IsEthosnFunc(call, "ethos-n.qnn_sigmoid")) { @@ -296,7 +296,7 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) { } else if (IsEthosnOp(call, "reshape")) { if ((err = MakeReshapeLayer(call, &tensor))) ReportFatalError(call, err); return MakeOps(tensor); - } else if (IsEthosnOp(call, "qnn.add")) { + } else if (IsEthosnFunc(call, "ethos-n.qnn_add")) { if ((err = MakeAdditionLayer(call, &tensor))) ReportFatalError(call, err); return MakeOps(tensor); } else if (IsEthosnFunc(call, 
"ethos-n.qnn_sigmoid")) { @@ -468,7 +468,7 @@ EthosnError ConstructNetworkVisitor::MakeReshapeLayer(const Call& call, EthosnError ConstructNetworkVisitor::MakeAdditionLayer(const Call& call, sl::TensorAndId* out) { AdditionParams params; - if (auto err = EthosnAPI::Addition(call, ¶ms)) { + if (auto err = EthosnAPI::Addition(call->op.as()->body, ¶ms)) { return err; } diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc index 6b64467047f4..12b5a12afb35 100644 --- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc +++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc @@ -38,6 +38,20 @@ namespace relay { namespace contrib { namespace ethosn { +/*! + * \brief Apply constant folding on an expression. + * + * \param expr The expression to fold. + * \param fold_qnn Whether to fold constants for QNN operations. + * \returns The new folded expression. + */ +Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true) { + auto mod = IRModule::FromExpr(expr); + mod = transform::FoldConstant(fold_qnn)(mod); + auto entry_func = Downcast(mod->Lookup("main")); + return expr.as() == nullptr ? entry_func->body : entry_func; +} + /*! * \brief Converts qnn.mul to mathematically equivalent * qnn.conv2d depthwise operation. 
@@ -65,7 +79,9 @@ Expr ConvertQnnMultiply(const Expr& expr) { const auto* input_constant = input2.as(); ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey(); - const auto* input_constant_tt = input_constant->checked_type().as(); + Type input_constant_type = input_constant->checked_type(); + const auto* input_constant_tt = input_constant_type.as(); + ICHECK(input_constant_tt) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey(); int channels = input_constant_tt->shape.back().as()->value; runtime::NDArray input_data = input_constant->data; @@ -93,6 +109,83 @@ Expr ConvertQnnMultiply(const Expr& expr) { TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnMultiply") .set_body_typed(ConvertQnnMultiply); +/*! + * \brief Converts qnn.add to a mathematically equivalent + * qnn.conv2d depthwise operation. + */ +Expr ConvertQnnAdd(const Expr& expr) { + Call call = Downcast(expr); + + Expr input1 = call->args[0]; + Expr input2 = call->args[1]; + Expr input1_scale = call->args[2]; + Expr input1_zero_point = call->args[3]; + Expr input2_scale = call->args[4]; + Expr input2_zero_point = call->args[5]; + // Reverse the inputs if the constant is first input + if (call->args[0]->IsInstance()) { + input1 = call->args[1]; + input2 = call->args[0]; + input1_scale = call->args[4]; + input1_zero_point = call->args[5]; + input2_scale = call->args[2]; + input2_zero_point = call->args[3]; + } + Expr output_scale = call->args[6]; + Expr output_zero_point = call->args[7]; + + const auto* input_constant = input2.as(); + ICHECK(input_constant) << "Expected ConstantNode but got " << input2->GetTypeKey(); + Type input_constant_type = input_constant->checked_type(); + const auto* input_constant_tt = input_constant_type.as(); + ICHECK(input_constant_tt) << "Expected TensorTypeNode but got " << input_constant_type->GetTypeKey(); + int channels = input_constant_tt->shape.back().as()->value; + + // Create the identity kernel.
The kernel data is constructed such that it produces an identity + // operation in the quantized space. Therefore, the input is not scaled in any way which allows + // us to later use the bias to perform the addition. + float input_scale_value = GetScalarFromConstant(input1_scale); + float output_scale_value = GetScalarFromConstant(output_scale); + float identity_kernel_scale_ub = std::min(output_scale_value / input_scale_value, 1.f); + float identity_kernel_scale_lb = (1.f / 255.f); + float identity_kernel_scale_target = (identity_kernel_scale_ub + identity_kernel_scale_lb) / 2.f; + float identity_kernel_scale_recip_rounded = std::round(1.f / identity_kernel_scale_target); + float identity_kernel_scale_value = 1.f / identity_kernel_scale_recip_rounded; + Constant identity_kernel_scale = + MakeConstantScalar(DataType::Float(32), identity_kernel_scale_value); + Constant identity_kernel_zero_point = MakeConstantScalar(DataType::Int(32), 0); + float identity_kernel_quantized_data = identity_kernel_scale_recip_rounded; + std::vector identity_kernel_data(channels, + static_cast(identity_kernel_quantized_data)); + Constant identity_kernel = + MakeConstantTensor(input_constant_tt->dtype, {1, 1, channels, 1}, identity_kernel_data); + + // Calculate the bias, this is where the addition happens. The bias values are calculated by + // scaling the constant input to input_scale * identity_kernel_scale. 
+ Constant bias_scale = + MakeConstantScalar(DataType::Float(32), input_scale_value * identity_kernel_scale_value); + Constant bias_zero_point = MakeConstantScalar(DataType::Int(32), 0); + Expr requantize_bias = + qnn::MakeRequantize(input2, input2_scale, input2_zero_point, bias_scale, bias_zero_point, -1, + "None", "None", DataType::Int(32)); + Expr reshape_bias = MakeReshape(requantize_bias, {channels}); + Constant bias = Downcast(FoldConstantExpr(reshape_bias)); + + // Make depthwise conv2d operation + Expr conv2d = + qnn::MakeQnnConv2D(input1, identity_kernel, input1_zero_point, identity_kernel_zero_point, + input1_scale, identity_kernel_scale, {1, 1}, {0, 0, 0, 0}, {1, 1}, + channels, channels, {1, 1}, "NHWC", "HWOI", "NHWC", DataType::Int(32)); + Expr bias_add = MakeBiasAdd(conv2d, bias, 3); + Expr requantize = + qnn::MakeRequantize(bias_add, input1_scale, input1_zero_point, output_scale, + output_zero_point, -1, "None", "None", input_constant_tt->dtype); + + return InferType(requantize); +} + +TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.ConvertQnnAdd").set_body_typed(ConvertQnnAdd); + class ConvertEquivalentsMutator : public MixedModeMutator { public: Expr Rewrite_(const CallNode* pre, const Expr& post) override { @@ -108,11 +201,25 @@ class ConvertEquivalentsMutator : public MixedModeMutator { Expr new_func_body = ConvertQnnMultiply(func->body); new_func = WithFields(func, func->params, new_func_body); new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d")); + } else if (composite_name == "ethos-n.qnn_add" && CheckCanConvertAdd(func->body)) { + Expr new_func_body = ConvertQnnAdd(func->body); + new_func = WithFields(func, func->params, new_func_body); + new_func = WithAttr(std::move(new_func), attr::kComposite, String("ethos-n.qnn_conv2d")); } Call new_call = WithFields(call, new_func); return Downcast(new_call); } + + private: + /*! 
+ * \brief Check whether add can be converted to depthwise, or whether + * it should be offloaded as a normal add operation. + */ + bool CheckCanConvertAdd(const Expr& expr) { + Call call = Downcast(expr); + return call->args[0]->IsInstance() || call->args[1]->IsInstance(); + } }; tvm::transform::Pass ConvertEquivalents() { diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index c227ef5c3aea..a1c8ca0a32d2 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -83,7 +83,8 @@ def make_module(func, params): def make_ethosn_composite(ethosn_expr, name): vars = relay.analysis.free_vars(ethosn_expr) - func = relay.Function([relay.Var("a")], ethosn_expr) + inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars] + func = relay.Function(inner_vars, ethosn_expr) func = func.with_attr("Composite", name) call = relay.Call(func, vars) return call diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py index cc8e030d372d..72981182e17f 100644 --- a/tests/python/contrib/test_ethosn/test_addition.py +++ b/tests/python/contrib/test_ethosn/test_addition.py @@ -25,11 +25,37 @@ from . 
import infrastructure as tei -def _get_model(input_shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype): +def _get_model( + lhs_shape, + rhs_shape, + lhs_zp, + lhs_sc, + rhs_zp, + rhs_sc, + out_zp, + out_sc, + dtype, + lhs_is_constant=False, + rhs_is_constant=False, +): """Return a model and any parameters it may have""" - a = relay.var("a", shape=input_shape, dtype=dtype) - b = relay.var("b", shape=input_shape, dtype=dtype) + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + + if lhs_is_constant: + a_data = np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype) + a = relay.const(a_data, dtype=dtype) + else: + a = relay.var("a", shape=lhs_shape, dtype=dtype) + + if rhs_is_constant: + b_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype) + b = relay.const(b_data, dtype=dtype) + else: + b = relay.var("b", shape=rhs_shape, dtype=dtype) + model = relay.qnn.op.add( lhs=a, rhs=b, @@ -43,74 +69,156 @@ def _get_model(input_shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtyp return model -def _get_addition_qnn_params(dtype, input1_zp, input1_sc, input2_zp, input2_sc): - input1_max = input1_sc * (255 - input1_zp) - input1_min = -input1_sc * input1_zp - input2_max = input2_sc * (255 - input2_zp) - input2_min = -input2_sc * input2_zp +def _get_addition_qnn_params(dtype): + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + lhs_zp = np.random.randint(data_min, data_max) + lhs_sc = np.random.random() * 2 + rhs_zp = np.random.randint(data_min, data_max) + rhs_sc = np.random.random() * 2 + + input1_max = lhs_sc * (255 - lhs_zp) + input1_min = -lhs_sc * lhs_zp + input2_max = rhs_sc * (255 - rhs_zp) + input2_min = -rhs_sc * rhs_zp output_max = input1_max + input2_max output_min = input1_min + input2_min output_sc = (output_max - output_min) / 255 output_zp = -int(output_min / output_sc) - return output_zp, output_sc + return lhs_zp, lhs_sc, rhs_zp, rhs_sc, output_zp, output_sc + + 
+@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize("shape", [(1, 22, 9, 9), (1, 27, 21, 16)]) +def test_addition(dtype, shape): + """Compare Addition output with TVM.""" + np.random.seed(0) + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) + + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype)), + "b": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=shape, dtype=dtype)), + } + model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype) + for npu in [False, True]: + mod = tei.make_module(model, []) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + + tei.verify(outputs, dtype, 1) + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "lhs_shape,rhs_shape", + [ + ((1, 4, 4, 8), (1, 1, 1, 8)), + ((1, 16, 12, 4), (4,)), + ], +) +def test_addition_to_depthwise_rhs_constant(dtype, lhs_shape, rhs_shape): + """Compare addition to depthwise with TVM.""" + np.random.seed(0) + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) + + model = _get_model( + lhs_shape, + rhs_shape, + lhs_zp, + lhs_sc, + rhs_zp, + rhs_sc, + out_zp, + out_sc, + dtype, + lhs_is_constant=False, + rhs_is_constant=True, + ) + inputs = { + "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=lhs_shape, dtype=dtype)) + } + outputs = [] + for npu in [False, True]: + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + tei.verify(outputs, dtype, 1) @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_addition(dtype): - zp_min = np.iinfo(dtype).min - zp_max = np.iinfo(dtype).max - trials = [ - ((1, 22, 9, 
9), zp_min + 24, 1.057, zp_max - 3, 0.452), - ((1, 27, 21, 16), zp_min + 79, 0.850, 24, 0.380), - ((1, 7, 12, 28), zp_min + 125, 1.293, zp_max - 16, 0.320), - ((1, 14, 9, 6), zp_min + 14, 0.942, zp_max - 28, 1.562), - ((1, 13, 16, 22), zp_min + 15, 0.727, zp_max - 75, 0.461), - ] +@pytest.mark.parametrize( + "lhs_shape,rhs_shape", + [ + ((1, 8), (1, 20, 15, 8)), + ], +) +def test_addition_to_depthwise_lhs_constant(dtype, lhs_shape, rhs_shape): + """Compare addition to depthwise with TVM.""" np.random.seed(0) - for shape, rhs_zp, rhs_sc, lhs_zp, lhs_sc in trials: - outputs = [] - inputs = { - "a": tvm.nd.array(np.random.randint(zp_min, zp_max + 1, size=shape, dtype=dtype)), - "b": tvm.nd.array(np.random.randint(zp_min, zp_max + 1, size=shape, dtype=dtype)), - } - out_zp, out_sc = _get_addition_qnn_params(dtype, lhs_zp, lhs_sc, rhs_zp, rhs_sc) - model = _get_model(shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype) - for npu in [False, True]: - mod = tei.make_module(model, []) - outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) - tei.verify(outputs, dtype, 2) + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) + + model = _get_model( + lhs_shape, + rhs_shape, + lhs_zp, + lhs_sc, + rhs_zp, + rhs_sc, + out_zp, + out_sc, + dtype, + lhs_is_constant=True, + rhs_is_constant=False, + ) + inputs = { + "b": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype)) + } + outputs = [] + for npu in [False, True]: + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + tei.verify(outputs, dtype, 1) @requires_ethosn -def test_addition_failure(): - trials = [ +@pytest.mark.parametrize( + "dtype,shape,err_msg", + [ ( - (2, 4, 4, 4), "uint8", - 0, - 1, - 0, - 1, - 0, - 1, + (2, 4, 4, 4), "batch size=2, batch size must = 1; batch size=2, batch size must = 1", ), ( - (1, 4, 4, 4), "int16", 
- 0, - 1, - 0, - 1, - 0, - 1, - "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', dtype must be either uint8, int8 or int32", + (1, 4, 4, 4), + "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', " + "dtype must be either uint8, int8 or int32", ), - ] + ], +) +def test_addition_failure(dtype, shape, err_msg): + """Check addition error messages.""" + np.random.seed(0) + + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) - for shape, dtype, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, err_msg in trials: - model = _get_model(shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_add") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py index 570009422067..fe9b346691b6 100644 --- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py +++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py @@ -24,8 +24,10 @@ from tvm import relay from tvm.testing import requires_ethosn from tvm.relay.op.contrib.ethosn import ConvertEquivalents +from tvm.relay import ExprVisitor from . 
import infrastructure as tei +from .test_addition import _get_addition_qnn_params def _assert_structural_equal(a, b): @@ -38,35 +40,6 @@ def _assert_structural_equal(a, b): assert tvm.ir.structural_equal(a, b), reason -def _create_npu_module(inputs, expr, composite_name, ext_func_name): - """Wraps an operator as an NPU module.""" - gen_vars = lambda prefix, vars: [ - relay.var( - prefix + var.name_hint, shape=var.type_annotation.shape, dtype=var.type_annotation.dtype - ) - for var in vars - ] - - mod = tvm.ir.IRModule() - - func = relay.Function(relay.analysis.free_vars(expr), expr) - func = func.with_attr("Composite", composite_name) - inner_vars = gen_vars("inner_", inputs) - call = relay.Call(func, inner_vars) - - func2 = relay.Function(relay.analysis.free_vars(call), call) - func2 = func2.with_attr("Compiler", "ethos-n") - func2 = func2.with_attr("global_symbol", ext_func_name) - mod[ext_func_name] = func2 - mod = relay.transform.InferType()(mod) - - outer_vars = gen_vars("outer_", inputs) - out = relay.Call(mod.get_global_var(ext_func_name), outer_vars) - mod["main"] = relay.Function(relay.analysis.free_vars(out), out) - mod = relay.transform.InferType()(mod) - return mod - - @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) @pytest.mark.parametrize("shape,channels", [((1, 4, 4, 8), 8), ((1, 16, 12, 4), 4)]) @@ -101,7 +74,8 @@ def before(): relay.const(output_sc, "float32"), relay.const(output_zp, "int32"), ) - return _create_npu_module([x], expr, "ethos-n.qnn_mul", "ext_func") + composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_mul") + return tei.make_ethosn_partition(composite) def expected(): constant_shape_hwoi = (1, 1, channels, 1) @@ -134,9 +108,70 @@ def expected(): relay.const(output_zp, "int32"), out_dtype=dtype, ) - return _create_npu_module([x], expr, "ethos-n.qnn_conv2d", "ext_func") + composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_conv2d") + return tei.make_ethosn_partition(composite) mod = before() mod = 
ConvertEquivalents()(mod) expected_mod = expected() - _assert_structural_equal(mod["ext_func"], expected_mod["ext_func"]) + _assert_structural_equal(mod["ethos-n_0"], expected_mod["ethos-n_0"]) + + +@requires_ethosn +@pytest.mark.parametrize("reverse_inputs", [True, False]) +def test_add_to_depthwise(reverse_inputs): + """ + Check that add is converted correctly. + """ + dtype = "uint8" + lhs_shape = (1, 2, 4, 8) + rhs_shape = (1, 1, 1, 8) + np.random.seed(0) + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc = _get_addition_qnn_params(dtype) + + x = relay.var("x", shape=lhs_shape, dtype=dtype) + y_data = np.random.randint(data_min, data_max + 1, size=rhs_shape, dtype=dtype) + + def before(): + y = relay.const(y_data) + expr = relay.qnn.op.add( + lhs=y if reverse_inputs else x, + rhs=x if reverse_inputs else y, + lhs_scale=relay.const(lhs_sc, "float32"), + lhs_zero_point=relay.const(lhs_zp, "int32"), + rhs_scale=relay.const(rhs_sc, "float32"), + rhs_zero_point=relay.const(rhs_zp, "int32"), + output_scale=relay.const(out_sc, "float32"), + output_zero_point=relay.const(out_zp, "int32"), + ) + composite = tei.make_ethosn_composite(expr, "ethos-n.qnn_add") + return tei.make_ethosn_partition(composite) + + class ConversionChecker(ExprVisitor): + """ + Pass to check the new composite function is in the expected format. 
+ """ + + sequence = ["qnn.conv2d", "nn.bias_add", "qnn.requantize"] + + def visit_function(self, fn): + composite_name = fn.attrs["Composite"] + expected = "ethos-n.qnn_conv2d" + assert ( + composite_name == expected + ), f"Expected Composite attribute {expected} but got {composite_name}" + super().visit_function(fn) + + def visit_call(self, call): + op_name = call.op.name + expected_name = self.sequence.pop() + assert op_name == expected_name, f"Got operator {op_name} but expected {expected_name}" + super().visit_call(call) + + mod = before() + mod = ConvertEquivalents()(mod) + mod = ConversionChecker().visit(mod["ethos-n_0"].body.op) diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index abc4d37a7359..d16bf5bf325c 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -143,7 +143,7 @@ def test_resnet_50_int8(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. 
- _compile_hash = {"60404ad60fc2bfbb68464d8a14cc0452", "4225fa951c145bb1e48e28cad6a3bdd4"} + _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"} _test_image_network( model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/" "models/Quantized/resnet_50_quantized.tflite", From 21db1eb586f14b272b36f7e33830acc630823b5f Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Thu, 25 Aug 2022 10:23:46 -0600 Subject: [PATCH 046/704] [F2QI] Fix a rounding error on AvgPool when input and output affine scales differ (#12577) cc @sfvaroglu @AndrewZhaoLuo --- .../transform/fake_quantization_to_integer.py | 64 ++++++++++++++++--- .../test_pass_fake_quantization_to_integer.py | 15 ++--- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index 58dcc3477f6a..bb874c131cd8 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -114,11 +114,26 @@ def adaptive_avgpool1d(expr, type_map): """Rewrite an adaptive avgpool op""" arg = expr.args[0] t = type_map[arg] - arg = relay.op.cast(arg, "int32") + out_t = type_map[expr] + if not ( + approx_equal(t.scale, out_t.scale) + and approx_equal(t.zero_point, out_t.zero_point) + and tvm.ir.structural_equal(t.dtype, out_t.dtype) + ): + arg = relay.qnn.op.requantize( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + out_dtype="int32", + axis=t.axis, + ) + else: + arg = relay.op.cast(arg, "int32") output_size = expr.attrs.output_size out = relay.op.nn.adaptive_avg_pool1d(arg, output_size) - out = relay.op.cast(out, t.dtype) - return [out, t] + return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] @register_fake_quantization_to_integer("nn.avg_pool2d") @@ -126,10 +141,25 @@ def avgpool2d(expr, type_map): """Rewrite a avgpool op""" 
arg = expr.args[0] t = type_map[arg] - arg = relay.op.cast(arg, "int32") + out_t = type_map[expr] + if not ( + approx_equal(t.scale, out_t.scale) + and approx_equal(t.zero_point, out_t.zero_point) + and tvm.ir.structural_equal(t.dtype, out_t.dtype) + ): + arg = relay.qnn.op.requantize( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + out_dtype="int32", + axis=t.axis, + ) + else: + arg = relay.op.cast(arg, "int32") out = relay.op.nn.avg_pool2d(arg, **expr.attrs) - out = relay.op.cast(out, t.dtype) - return [out, t] + return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] @register_fake_quantization_to_integer("nn.global_avg_pool2d") @@ -137,10 +167,26 @@ def global_avgpool2d(expr, type_map): """Rewrite a global_avgpool op""" arg = expr.args[0] t = type_map[arg] - arg = relay.op.cast(arg, "int32") + out_t = type_map[expr] + out_t = type_map[expr] + if not ( + approx_equal(t.scale, out_t.scale) + and approx_equal(t.zero_point, out_t.zero_point) + and tvm.ir.structural_equal(t.dtype, out_t.dtype) + ): + arg = relay.qnn.op.requantize( + arg, + t.scale, + t.zero_point, + out_t.scale, + out_t.zero_point, + out_dtype="int32", + axis=t.axis, + ) + else: + arg = relay.op.cast(arg, "int32") out = relay.op.nn.global_avg_pool2d(arg) - out = relay.op.cast(out, t.dtype) - return [out, t] + return [out, TensorAffineType(out_t.scale, out_t.zero_point, "int32", out_t.axis)] @register_fake_quantization_to_integer("broadcast_to") diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index cdf5fd42a138..a63d82e68750 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -281,10 +281,9 @@ def test_fake_quantize_maxpool(): def test_fake_quantize_adaptive_avgpool1d(output_size): x = relay.var("x", shape=[1, 128, 768], dtype="int8") - zero = relay.const(0) - x = 
relay.qnn.op.dequantize(x, relay.const(2.0), zero) + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12)) op = relay.op.nn.adaptive_avg_pool1d(x, output_size) - op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10)) x_np = np.random.randint(-128, 127, size=[1, 128, 768], dtype="int8") @@ -294,10 +293,9 @@ def test_fake_quantize_adaptive_avgpool1d(output_size): def test_fake_quantize_avgpool(): x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") - zero = relay.const(0) - x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12)) op = relay.op.nn.avg_pool2d(x, [3, 3]) - op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10)) x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") @@ -307,10 +305,9 @@ def test_fake_quantize_avgpool(): def test_fake_quantize_global_avg_pool(): x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") - zero = relay.const(0) - x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(-12)) op = relay.op.nn.global_avg_pool2d(x) - op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + op = relay.qnn.op.quantize(op, relay.const(0.5), relay.const(10)) x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") From bb00a15c265ba12341aede06bbbf216dda585211 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Fri, 26 Aug 2022 01:42:57 +0800 Subject: [PATCH 047/704] [CUDA][CodeGen] Fix cuda codegen's fp16 inf literal (#12581) * Fix cuda codegen's fp16 inf literal * add relay testcase --- src/target/source/codegen_cuda.cc | 6 ++++-- tests/python/relay/test_op_level3.py | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 
2239cef92060..d96e0cbc1679 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -1197,8 +1197,10 @@ inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) break; } case 16: { - os << "__float2half_rn"; - os << '(' << std::scientific << op->value << 'f' << ')'; + os << "__float2half_rn" << '('; + FloatImm const_f32 = FloatImm(DataType::Float(32), op->value); + PrintConst(const_f32.get(), os, p); + os << ')'; break; } default: diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 2fe40ae2f88e..400f7dcf0b42 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -1344,7 +1344,7 @@ def verify_gather_nd(xshape, yshape, y_data, batch_dims=0, indices_dtype="int32" verify_gather_nd((2, 2, 2), (2, 2, 1), [[[1], [0]], [[0], [1]]], 1, indices_dtype="uint32") -def _verify_infiniteness_ops(relay_op, ref_op): +def _verify_infiniteness_ops(relay_op, ref_op, target="llvm", dev=None): for dtype in ["float32", "float16", "float16", "int32", "int16"]: shape = (2, 8, 8) x = relay.var("x", relay.TensorType(shape, dtype)) @@ -1359,17 +1359,25 @@ def _verify_infiniteness_ops(relay_op, ref_op): ] = np.infty data.ravel()[np.random.choice(data.size, int(data.size * 0.5), replace=False)] = np.nan - op_res = create_executor().evaluate(y, {x: data}) + op_res = create_executor(target=target, device=dev).evaluate(y, {x: data}) ref_res = ref_op(data) np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01) +@tvm.testing.requires_gpu def test_isfinite(): - _verify_infiniteness_ops(relay.isfinite, np.isfinite) + for target, dev in tvm.testing.enabled_targets(): + if target not in ["llvm", "cuda"]: + continue + _verify_infiniteness_ops(relay.isfinite, np.isfinite, target=target, dev=dev) +@tvm.testing.requires_gpu def test_isinf(): - _verify_infiniteness_ops(relay.isinf, np.isinf) + for target, dev in tvm.testing.enabled_targets(): + if target not in 
["llvm", "cuda"]: + continue + _verify_infiniteness_ops(relay.isinf, np.isinf, target=target, dev=dev) def test_unravel_index(target, dev, executor_kind): From 01fcdfcf5fcfda313df4e176ca3d919b076f77fc Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 25 Aug 2022 10:55:58 -0700 Subject: [PATCH 048/704] [ci] Default to n=2 for test parallelism (#12414) * Revert "[skip ci] Revert "[ci] Default to n=2 for test parallelism (#12376)" (#12413)" This reverts commit 478b672f2b7bb37f529fa6477b3c4ac353217b7a. * [ci] Default to n=2 for test parallelism This is attempt #2 of #12376 which was reverted in #12413. The changes in `plugin.py` should keep all the tests on the same node so sporadic failures don't happen due to scheduling. Co-authored-by: driazati --- Jenkinsfile | 60 +++++++++++++++++++++++++++++-- ci/jenkins/Jenkinsfile.j2 | 2 +- ci/jenkins/macros.j2 | 3 ++ python/tvm/testing/plugin.py | 42 ++++++++++++++++++++++ tests/scripts/setup-pytest-env.sh | 8 ++++- 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 15cd4927d0ba..8c1ce9ed5020 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-15T16:55:31.189354 +// Generated at 2022-08-19T15:38:38.311410 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -97,7 +97,7 @@ properties([ upstream_revision = null // command to start a docker container -docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS' +docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME' docker_build = 'docker/build.sh' // timeout in minutes max_time = 180 @@ -610,6 +610,7 @@ def lint() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS=2', + 'TEST_STEP_NAME=Lint', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh ( @@ -629,6 +630,7 @@ def lint() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS=2', + 'TEST_STEP_NAME=Lint', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh ( @@ -1225,6 +1227,7 @@ def shard_run_unittest_GPU_1_of_3() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=unittest: GPU', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1340,6 +1343,7 @@ def shard_run_unittest_GPU_2_of_3() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=unittest: GPU', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1421,6 +1425,7 @@ def shard_run_unittest_GPU_3_of_3() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=unittest: GPU', 'TVM_NUM_SHARDS=3', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1499,6 +1504,7 @@ def shard_run_integration_CPU_1_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1574,6 +1580,7 @@ def shard_run_integration_CPU_2_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 
'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1649,6 +1656,7 @@ def shard_run_integration_CPU_3_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1724,6 +1732,7 @@ def shard_run_integration_CPU_4_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1799,6 +1808,7 @@ def shard_run_integration_CPU_5_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1874,6 +1884,7 @@ def shard_run_integration_CPU_6_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -1949,6 +1960,7 @@ def shard_run_integration_CPU_7_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=6', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2024,6 +2036,7 @@ def shard_run_integration_CPU_8_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=7', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2099,6 +2112,7 @@ def shard_run_integration_CPU_9_of_10() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=8', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2174,6 +2188,7 @@ def shard_run_integration_CPU_10_of_10() { timeout(time: max_time, unit: 'MINUTES') { 
withEnv([ 'PLATFORM=cpu', + 'TEST_STEP_NAME=integration: CPU', 'TVM_NUM_SHARDS=10', 'TVM_SHARD_INDEX=9', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2250,6 +2265,7 @@ def shard_run_python_i386_1_of_5() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', 'TVM_NUM_SHARDS=5', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2325,6 +2341,7 @@ def shard_run_python_i386_2_of_5() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', 'TVM_NUM_SHARDS=5', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2400,6 +2417,7 @@ def shard_run_python_i386_3_of_5() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', 'TVM_NUM_SHARDS=5', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2474,6 +2492,7 @@ def shard_run_python_i386_4_of_5() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', 'TVM_NUM_SHARDS=5', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2548,6 +2567,7 @@ def shard_run_python_i386_5_of_5() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', 'TVM_NUM_SHARDS=5', 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2623,6 +2643,7 @@ def shard_run_test_Hexagon_1_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2697,6 +2718,7 @@ def shard_run_test_Hexagon_2_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2770,6 +2792,7 @@ def shard_run_test_Hexagon_3_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 
'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2843,6 +2866,7 @@ def shard_run_test_Hexagon_4_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2916,6 +2940,7 @@ def shard_run_test_Hexagon_5_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -2989,6 +3014,7 @@ def shard_run_test_Hexagon_6_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3062,6 +3088,7 @@ def shard_run_test_Hexagon_7_of_7() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', 'TVM_NUM_SHARDS=7', 'TVM_SHARD_INDEX=6', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3136,6 +3163,7 @@ def shard_run_integration_aarch64_1_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3210,6 +3238,7 @@ def shard_run_integration_aarch64_2_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3284,6 +3313,7 @@ def shard_run_integration_aarch64_3_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3358,6 +3388,7 @@ def shard_run_integration_aarch64_4_of_4() { timeout(time: max_time, unit: 'MINUTES') { 
withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3433,6 +3464,7 @@ def shard_run_topi_GPU_1_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3506,6 +3538,7 @@ def shard_run_topi_GPU_2_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3579,6 +3612,7 @@ def shard_run_topi_GPU_3_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3652,6 +3686,7 @@ def shard_run_topi_GPU_4_of_4() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3726,6 +3761,7 @@ def shard_run_frontend_GPU_1_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3799,6 +3835,7 @@ def shard_run_frontend_GPU_2_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3872,6 +3909,7 @@ def shard_run_frontend_GPU_3_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -3945,6 +3983,7 @@ def shard_run_frontend_GPU_4_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 
'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4018,6 +4057,7 @@ def shard_run_frontend_GPU_5_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4091,6 +4131,7 @@ def shard_run_frontend_GPU_6_of_6() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', + 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4165,6 +4206,7 @@ def shard_run_topi_aarch64_1_of_2() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=topi: aarch64', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4243,6 +4285,7 @@ def shard_run_topi_aarch64_2_of_2() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=topi: aarch64', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4321,6 +4364,7 @@ def shard_run_frontend_aarch64_1_of_2() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=frontend: aarch64', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4394,6 +4438,7 @@ def shard_run_frontend_aarch64_2_of_2() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', + 'TEST_STEP_NAME=frontend: aarch64', 'TVM_NUM_SHARDS=2', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4468,6 +4513,7 @@ def shard_run_test_Cortex_M_1_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4546,6 +4592,7 @@ def shard_run_test_Cortex_M_2_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 
'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4619,6 +4666,7 @@ def shard_run_test_Cortex_M_3_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4692,6 +4740,7 @@ def shard_run_test_Cortex_M_4_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4765,6 +4814,7 @@ def shard_run_test_Cortex_M_5_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4838,6 +4888,7 @@ def shard_run_test_Cortex_M_6_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4911,6 +4962,7 @@ def shard_run_test_Cortex_M_7_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=6', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -4984,6 +5036,7 @@ def shard_run_test_Cortex_M_8_of_8() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', 'TVM_NUM_SHARDS=8', 'TVM_SHARD_INDEX=7', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -5058,6 +5111,7 @@ def shard_run_test_RISC_V_1_of_1() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=riscv', + 'TEST_STEP_NAME=test: RISC-V', 'TVM_NUM_SHARDS=1', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -5361,6 +5415,7 @@ stage('Test') { docker_init(ci_cpu) init_git() withEnv(['PLATFORM=cpu', + 'TEST_STEP_NAME=unittest: CPU', "SKIP_SLOW_TESTS=${skip_slow_tests}"], 
{ sh( script: """ @@ -5435,6 +5490,7 @@ stage('Test') { docker_init(ci_cpu) init_git() withEnv(['PLATFORM=cpu', + 'TEST_STEP_NAME=frontend: CPU', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 index 4960d4f0fa57..be2776c6d9e3 100644 --- a/ci/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -85,7 +85,7 @@ properties([ upstream_revision = null // command to start a docker container -docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS' +docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS --env RUN_DISPLAY_URL --env PLATFORM --env SKIP_SLOW_TESTS --env TEST_STEP_NAME' docker_build = 'docker/build.sh' // timeout in minutes max_time = 180 diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2 index dbd6ac551db4..9d02ad68d6da 100644 --- a/ci/jenkins/macros.j2 +++ b/ci/jenkins/macros.j2 @@ -44,6 +44,7 @@ def {{ method_name }}() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM={{ platform }}', + 'TEST_STEP_NAME={{ name }}', 'TVM_NUM_SHARDS={{ num_shards }}', 'TVM_SHARD_INDEX={{ shard_index - 1 }}', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -75,6 +76,7 @@ def {{ method_name }}() { timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS={{ num_shards }}', + 'TEST_STEP_NAME={{ name }}', 'TVM_SHARD_INDEX={{ shard_index - 1 }}', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { {{ caller() | trim | indent(width=6) }} @@ -121,6 +123,7 @@ def {{ method_name }}() { docker_init({{ docker_image }}) init_git() withEnv(['PLATFORM={{ platform }}', + 'TEST_STEP_NAME={{ name }}', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { {{ caller() | indent(width=12) | trim }} }) diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index 1f4f983b7210..2d845b70ff11 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -37,6 +37,13 @@ 
import tvm from tvm.testing import utils +try: + from xdist.scheduler.loadscope import LoadScopeScheduling + + HAVE_XDIST = True +except ImportError: + HAVE_XDIST = False + MARKERS = { "gpu": "mark a test as requiring a gpu", @@ -319,3 +326,38 @@ def _parametrize_correlated_parameters(metafunc): names = ",".join(name for name, values in params) value_sets = zip(*[values for name, values in params]) metafunc.parametrize(names, value_sets, indirect=True, ids=ids) + + +# pytest-xdist isn't required but is used in CI, so guard on its presence +if HAVE_XDIST: + + def pytest_xdist_make_scheduler(config, log): + """ + Serialize certain tests for pytest-xdist that have inter-test + dependencies + """ + + class TvmTestScheduler(LoadScopeScheduling): + """ + Scheduler to serializer tests + """ + + def _split_scope(self, nodeid): + """ + Returns a specific string for classes of nodeids + """ + # NOTE: these tests contain inter-test dependencies and must be + # serialized + items = { + "test_tvm_testing_features": "functional-tests", + "tests/python/unittest/test_crt": "crt-tests", + "tests/python/driver/tvmc": "tvmc-tests", + } + + for nodeid_pattern, suite_name in items.items(): + if nodeid_pattern in nodeid: + return suite_name + + return nodeid + + return TvmTestScheduler(config, log) diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index afb759c09356..d6c49a42819a 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -74,8 +74,14 @@ function run_pytest() { suite_name="${test_suite_name}-${current_shard}-${ffi_type}" + # Some test environments don't play well with parallelism + DEFAULT_PARALLELISM=2 + if [[ "${TEST_STEP_NAME:-default}" == "frontend: GPU"* ]] || [[ "${TEST_STEP_NAME:-default}" == "test: Hexagon"* ]]; then + DEFAULT_PARALLELISM=1 + fi + if [ ! "${extra_args[@]}" == *" -n"* ] && [! 
"${extra_args[@]}" == *" -dist"* ]; then - extra_args+=("-n=1") + extra_args+=("-n=$DEFAULT_PARALLELISM") fi exit_code=0 From 8d60b3cbbcacc5675383c353a1180be0cbc59cb9 Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 25 Aug 2022 13:05:09 -0700 Subject: [PATCH 049/704] [Runtime] Change default alignment to 64 bytes. (#12586) * Change default alignment to 64 bits. * Run dlpack test a few times. * Update alignment in tests. * Revert mma alignment change. * Change default printing of buffer. * Change crt runtime default allocation. --- include/tvm/runtime/device_api.h | 4 +- python/tvm/tir/tensor_intrin/cuda.py | 54 ++++--- src/printer/tir_text_printer.cc | 2 +- src/runtime/crt/common/crt_runtime_api.c | 2 +- tests/python/contrib/test_dlpack.py | 10 +- .../test_ethosu/test_tir_to_cs_translator.py | 68 ++++----- .../contrib/test_ethosu/test_vela_api.py | 18 +-- .../test_tir_analysis_calculate_workspace.py | 16 +- tests/python/unittest/test_tir_intrin.py | 8 +- .../unittest/test_tir_schedule_analysis.py | 6 +- .../unittest/test_tir_schedule_reduction.py | 16 +- .../test_tir_schedule_storage_align.py | 18 +-- .../unittest/test_tir_schedule_tensorize.py | 30 ++-- ..._tir_transform_convert_for_loops_serial.py | 8 +- ...est_tir_transform_inject_rolling_buffer.py | 12 +- tests/python/unittest/test_tir_usmp_algo.py | 18 +-- ...st_tir_usmp_analysis_extract_bufferinfo.py | 138 +++++++++--------- ...orm_convert_pool_allocations_to_offsets.py | 36 ++--- ..._tir_usmp_transform_create_io_allocates.py | 48 +++--- tests/python/unittest/test_tir_usmp_utils.py | 18 +-- .../unittest/test_tvmscript_complete.py | 18 +-- .../unittest/test_tvmscript_roundtrip.py | 36 ++--- .../unittest/test_tvmscript_syntax_sugar.py | 12 +- tests/python/unittest/test_tvmscript_type.py | 6 +- 24 files changed, 303 insertions(+), 299 deletions(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index c3d83bf2993f..1bb10fa17ae6 100644 --- a/include/tvm/runtime/device_api.h +++ 
b/include/tvm/runtime/device_api.h @@ -52,10 +52,10 @@ enum DeviceAttrKind : int { }; /*! \brief Number of bytes each allocation must align to */ -constexpr int kAllocAlignment = 128; +constexpr int kAllocAlignment = 64; /*! \brief Number of bytes each allocation must align to in temporary allocation */ -constexpr int kTempAllocaAlignment = 128; +constexpr int kTempAllocaAlignment = 64; /*! \brief Maximum size that can be allocated on stack */ constexpr int kMaxStackAlloca = 1024; diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py index b4f5d1d331e5..64d7c24840ae 100644 --- a/python/tvm/tir/tensor_intrin/cuda.py +++ b/python/tvm/tir/tensor_intrin/cuda.py @@ -120,12 +120,12 @@ def ldmatrix_desc(warp_handle: T.handle, shared_handle: T.handle) -> None: shared_handle, shmem_shape, dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, ) warp = T.match_buffer( - warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -149,13 +149,13 @@ def ldmatrix_impl(warp_handle: T.handle, shared_handle: T.handle) -> None: shared_handle, shmem_shape, dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, strides=[s0, s1], ) warp = T.match_buffer( - warp_handle, (WARP_SIZE, local_size), dtype, align=128, offset_factor=16, scope="warp" + warp_handle, (WARP_SIZE, local_size), dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -222,13 +222,13 @@ def maybe_swap(i, j): @T.prim_func def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) B = T.match_buffer( - b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + b, (WARP_SIZE, 
local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) C = T.match_buffer( - c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -262,13 +262,13 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + a, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) B = T.match_buffer( - b, (WARP_SIZE, local_size), in_dtype, align=128, offset_factor=16, scope="warp" + b, (WARP_SIZE, local_size), in_dtype, align=64, offset_factor=16, scope="warp" ) C = T.match_buffer( - c, (WARP_SIZE, local_size_out), out_dtype, align=128, offset_factor=16, scope="warp" + c, (WARP_SIZE, local_size_out), out_dtype, align=64, offset_factor=16, scope="warp" ) with T.block("root"): @@ -510,11 +510,9 @@ def get_wmma_load_intrin( @T.prim_func def wmma_load_desc(a: T.handle, c: T.handle) -> None: - A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=shared_scope - ) + A = T.match_buffer(a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=shared_scope) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -532,13 +530,13 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None: a, (m_dim, n_dim), dtype, - align=128, + align=64, offset_factor=16, scope=shared_scope, strides=[s1, s0], ) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=wmma_fragment_scope + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope ) with T.block("root"): T.reads(A[0:m_dim, 
0:n_dim]) @@ -569,7 +567,7 @@ def get_wmma_fill_intrin( @T.prim_func def wmma_fill_desc(c: T.handle) -> None: C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): T.reads() @@ -582,7 +580,7 @@ def wmma_fill_desc(c: T.handle) -> None: @T.prim_func def wmma_fill_impl(c: T.handle) -> None: C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): T.reads() @@ -610,9 +608,9 @@ def get_wmma_store_intrin( @T.prim_func def wmma_store_desc(a: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) - C = T.match_buffer(c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope) + C = T.match_buffer(c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) T.writes(C[0:m_dim, 0:n_dim]) @@ -626,10 +624,10 @@ def wmma_store_impl(a: T.handle, c: T.handle) -> None: s1 = T.var("int32") s0 = T.var("int32") A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope="wmma.accumulator" + a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=128, offset_factor=16, scope=scope, strides=[s1, s0] + c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope, strides=[s1, s0] ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -671,18 +669,18 @@ def maybe_swap(i, j): @T.prim_func def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a" + a, 
(m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a" ) B = T.match_buffer( b, maybe_swap(k_dim, n_dim), in_dtype, - align=128, + align=64, offset_factor=16, scope="wmma.matrix_b", ) C = T.match_buffer( - c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): @@ -699,18 +697,18 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer( - a, (m_dim, k_dim), in_dtype, align=128, offset_factor=16, scope="wmma.matrix_a" + a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a" ) B = T.match_buffer( b, maybe_swap(k_dim, n_dim), in_dtype, - align=128, + align=64, offset_factor=16, scope="wmma.matrix_b", ) C = T.match_buffer( - c, (m_dim, n_dim), out_dtype, align=128, offset_factor=16, scope="wmma.accumulator" + c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator" ) with T.block("root"): diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 894a9cec1e2a..cdfc8fd318fd 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -251,7 +251,7 @@ Doc TIRTextPrinter::BufferNode2Doc(const BufferNode* buf, Doc doc) { if (GetRef(buf).scope() != "global") { doc << ", scope=" << Doc::StrLiteral(GetRef(buf).scope()); } - if (buf->data_alignment != 128) { + if (buf->data_alignment != runtime::kAllocAlignment) { doc << ", align=" << buf->data_alignment; } if (buf->offset_factor != 1) { diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index 2151c23f8462..7df610b53c45 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -104,7 +104,7 @@ int TVMDeviceAllocDataSpaceWithScope(DLDevice dev, int ndim, const int64_t* shap 
} nbytes *= (dtype.bits * dtype.lanes + 7) / 8; - int kAllocAlignment = 128; + int kAllocAlignment = 64; size_t align = (dtype.bits / 8) * dtype.lanes; if (align < kAllocAlignment) align = kAllocAlignment; return TVMDeviceAllocDataSpace(dev, nbytes, align, dtype, out_data); diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index c71fc45d0346..4e65f79c518e 100644 --- a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -21,7 +21,7 @@ from tvm.contrib.dlpack import to_pytorch_func -def test(): +def verify_torch_dlpack(): a = np.random.randn(1337) tvm_a = tvm.nd.array(a) np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).numpy(), a) @@ -63,5 +63,11 @@ def test(): pass +def test_torch_dlpack(): + # Run dlpack interoperability test a few times to make sure it's stable. + for i in range(5): + verify_torch_dlpack() + + if __name__ == "__main__": - test() + test_torch_dlpack() diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index 28522138cafc..e1a0e143281b 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -525,10 +525,10 @@ class SingleEthosuDepthwiseConv2D: def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_depthwise_conv2d: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=128, 
offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_1, [18], dtype="int8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_2, [30], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_3 = T.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_depthwise_conv2d_1 = T.match_buffer(ethosu_depthwise_conv2d, [126], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder_3[0], 0, 0, 0, T.float32(0.6), 11, "NHWC", 24, 3, 1, "int8", 6, 7, 3, 6, 0, 7, ethosu_depthwise_conv2d_1[0], 0, 0, 0, T.float32(0.26), 15, "NHWC", 21, 3, 1, 2, 3, 1, 1, 1, 1, placeholder_4[0], 18, 13, placeholder_5[0], 30, 0, 0, 0, 0, "CLIP", 15, 105, "TFL", "NONE", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -991,8 +991,8 @@ class SingleEthosuPooling: def main(placeholder: T.handle, placeholder_3: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [75], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_pooling", "int8", 5, 9, 3, 5, 0, 9, placeholder_4[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 5, 3, 5, 0, 5, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 15, 3, 1, "AVG", 2, 3, 2, 1, 1, 1, 1, 1, 1, 0, "CLIP", 10, 100, "TFL", "NONE", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1065,10 +1065,10 @@ def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_2 = T.match_buffer( - placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1 + placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1 ) ethosu_write_2 = T.match_buffer( - ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1 + ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate(T.call_extern( "ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "ADD", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) @@ -1084,8 +1084,8 @@ class SingleEthosuBinaryElementwiseSub: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SUB", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1099,8 +1099,8 @@ class SingleEthosuBinaryElementwiseMul: def main(placeholder: T.handle, 
ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MUL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1115,8 +1115,8 @@ class SingleEthosuBinaryElementwiseMin: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MIN", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, 
dtype="int8")) __tvm_meta__ = None @@ -1131,8 +1131,8 @@ class SingleEthosuBinaryElementwiseMax: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int8", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "MAX", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1147,8 +1147,8 @@ class SingleEthosuBinaryElementwiseShr: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, 
"int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHR", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1163,8 +1163,8 @@ class SingleEthosuBinaryElementwiseShl: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [270], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [135], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 5, 9, 3, 5, 0, 9, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, placeholder_2[135], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "int32", 5, 9, 3, 5, 0, 9, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 27, 3, 1, "SHL", 0, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1284,8 +1284,8 @@ class SingleEthosuBinaryElementwiseAddBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, 
placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "ADD", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1299,8 +1299,8 @@ class SingleEthosuBinaryElementwiseSubBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SUB", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1314,8 +1314,8 @@ class SingleEthosuBinaryElementwiseMulBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], 
dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MUL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1330,8 +1330,8 @@ class SingleEthosuBinaryElementwiseMinBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MIN", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1346,8 +1346,8 @@ class SingleEthosuBinaryElementwiseMaxBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = 
T.match_buffer(placeholder, [27], dtype="int8", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int8", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int8", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "MAX", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int8")) __tvm_meta__ = None @@ -1362,8 +1362,8 @@ class SingleEthosuBinaryElementwiseShrBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHR", 1, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None @@ -1378,8 +1378,8 @@ class SingleEthosuBinaryElementwiseShlBroadcasting: def main(placeholder: T.handle, ethosu_write: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=128, 
offset_factor=1) - ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=128, offset_factor=1) + placeholder_2 = T.match_buffer(placeholder, [27], dtype="int32", elem_offset=0, align=64, offset_factor=1) + ethosu_write_2 = T.match_buffer(ethosu_write, [24], dtype="int32", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("ethosu_binary_elementwise", "int32", 2, 3, 4, 2, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "int32", 1, 3, 1, 1, 0, 3, placeholder_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 1, 1, 1, "int32", 2, 3, 4, 2, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1.0), 0, "NHWC", 12, 4, 1, "SHL", 1, "CLIP", 10, 100, "TFL", 0, 0, 0, dtype="int32")) __tvm_meta__ = None diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py index e2e4b2cb3a91..75ca22d08202 100644 --- a/tests/python/contrib/test_ethosu/test_vela_api.py +++ b/tests/python/contrib/test_ethosu/test_vela_api.py @@ -50,16 +50,16 @@ def main( # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_3 = T.match_buffer( - placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_4 = T.match_buffer( - placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_5 = T.match_buffer( - placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1 ) ethosu_conv2d_1 = T.match_buffer( - ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate( @@ -142,20 +142,20 @@ def main( # function attr dict 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) placeholder_3 = T.match_buffer( - placeholder, [192], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder, [192], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_4 = T.match_buffer( - placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + placeholder_1, [48], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) placeholder_5 = T.match_buffer( - placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1 + placeholder_2, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1 ) # Per-channel weight scales placeholder_7 = T.match_buffer( - placeholder_6, [16], dtype="float32", elem_offset=0, align=128, offset_factor=1 + placeholder_6, [16], dtype="float32", elem_offset=0, align=64, offset_factor=1 ) ethosu_conv2d_1 = T.match_buffer( - ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=128, offset_factor=1 + ethosu_conv2d, [1024], dtype="uint8", elem_offset=0, align=64, offset_factor=1 ) # body T.evaluate( diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py index 8d3163c111c8..1d78458b930d 100644 --- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py +++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py @@ -26,10 +26,10 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.handle, placeholder_146: T.handle, T_cast_48: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13", "tir.noalias": True}) - placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_149 = T.match_buffer(placeholder_146, [512], 
dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_22 = T.allocate([131072], "int16", "global") DepthwiseConv2d_9 = T.allocate([100352], "int32", "global") @@ -57,10 +57,10 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.hand def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handle, placeholder_164: T.handle, T_cast_76: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9", "tir.noalias": True}) - placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_77 = T.match_buffer(T_cast_76, [100352], 
dtype="int16", elem_offset=0, align=64, offset_factor=1) sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8]) # body PaddedInput_25 = T.allocate([131072], "int16", "global") diff --git a/tests/python/unittest/test_tir_intrin.py b/tests/python/unittest/test_tir_intrin.py index b8061fc0732a..f887f8877a22 100644 --- a/tests/python/unittest/test_tir_intrin.py +++ b/tests/python/unittest/test_tir_intrin.py @@ -203,7 +203,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -212,7 +212,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_1], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -221,7 +221,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_2], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) @@ -230,7 +230,7 @@ def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None: [n], strides=[stride_3], elem_offset=0, - align=128, + align=64, offset_factor=1, buffer_type="auto", ) diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index d3e6033e880c..5524abbaf094 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -218,9 +218,9 @@ def test_get_tensorize_loop_mapping_conv2d_nchwc_vnni(): def test_get_tensorize_loop_mapping_matmul_mma(): @T.prim_func def matmul_16x16x16xf16f16f16_desc( - A: T.Buffer((16, 16), "float16", align=128, offset_factor=1), - B: T.Buffer((16, 16), "float16", align=128, offset_factor=1), - C: T.Buffer((16, 16), "float16", align=128, offset_factor=1), + A: T.Buffer((16, 16), "float16", align=64, offset_factor=1), + B: T.Buffer((16, 16), "float16", align=64, offset_factor=1), + C: 
T.Buffer((16, 16), "float16", align=64, offset_factor=1), ) -> None: with T.block("root"): T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16]) diff --git a/tests/python/unittest/test_tir_schedule_reduction.py b/tests/python/unittest/test_tir_schedule_reduction.py index f3503460e50a..1600b27f5e78 100644 --- a/tests/python/unittest/test_tir_schedule_reduction.py +++ b/tests/python/unittest/test_tir_schedule_reduction.py @@ -78,8 +78,8 @@ def matmul_decompose0(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def matmul_decompose1(a: T.handle, b: T.handle) -> None: - A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [32, 4], elem_offset=0, align=128, offset_factor=1) + A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [32, 4], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 32): with T.block("blockized_B_init"): @@ -100,9 +100,9 @@ def matmul_decompose1(a: T.handle, b: T.handle) -> None: @T.prim_func def matmul_decompose2(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(128, 128): with T.block("update_init"): @@ -130,9 +130,9 @@ def matmul_decompose_fail3(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def matmul_decompose4(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = 
T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) diff --git a/tests/python/unittest/test_tir_schedule_storage_align.py b/tests/python/unittest/test_tir_schedule_storage_align.py index 072640c8f3af..23cb5d3b5339 100644 --- a/tests/python/unittest/test_tir_schedule_storage_align.py +++ b/tests/python/unittest/test_tir_schedule_storage_align.py @@ -26,13 +26,13 @@ @T.prim_func def element_wise(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): @@ -50,13 +50,13 @@ def element_wise(a: T.handle, c: T.handle) -> None: @T.prim_func def element_wise_storage_align(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 
in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): @@ -75,13 +75,13 @@ def element_wise_storage_align(a: T.handle, c: T.handle) -> None: @T.prim_func def element_wise_invalid_annotation(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py index 929a6cfa19bc..828dad2fc036 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize.py +++ b/tests/python/unittest/test_tir_schedule_tensorize.py @@ -36,9 +36,9 @@ @T.prim_func def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -51,9 +51,9 @@ def mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_intrin(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, 
offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -173,9 +173,9 @@ def matmul( @T.prim_func def tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i_outer, j_outer in T.grid(8, 8): for i_inner_init, j_inner_init in T.grid(16, 16): @@ -375,9 +375,9 @@ def tensorized_batch_matmul_outer_product( @T.prim_func def annotated_mma_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), align=128, offset_factor=1) - B = T.match_buffer(b, (16, 16), align=128, offset_factor=1) - C = T.match_buffer(c, (16, 16), align=128, offset_factor=1) + A = T.match_buffer(a, (16, 16), align=64, offset_factor=1) + B = T.match_buffer(b, (16, 16), align=64, offset_factor=1) + C = T.match_buffer(c, (16, 16), align=64, offset_factor=1) with T.block("root"): T.reads(C[0 : 16, 0 : 16], A[0 : 16, 0 : 16], B[0 : 16, 0 : 16]) @@ -406,9 +406,9 @@ def annotated_matmul( @T.prim_func def annotated_tensorized_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, 
offset_factor=1) + B = T.match_buffer(b, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) for i_outer, j_outer in T.grid(8, 8): for i_inner_init, j_inner_init in T.grid(16, 16): diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py index 38431705611b..1a3afdd4c1e2 100644 --- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py +++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py @@ -26,10 +26,10 @@ def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True}) - placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_3 = T.allocate([150528], "int16", "global") for i0_i1_fused_3 in T.parallel(0, 28): diff --git 
a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py index 073a0ebd4e84..65a586b8ecfd 100644 --- a/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py +++ b/tests/python/unittest/test_tir_transform_inject_rolling_buffer.py @@ -196,9 +196,9 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) # body T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "") for ax1_outer in T.serial(0, 2): @@ -228,9 +228,9 @@ def main(A: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1) + tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", 
elem_offset=0, align=64, offset_factor=1) # body T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "") T.realize(tensor_2[0:1, 0:6, 0:12, 0:16], "") diff --git a/tests/python/unittest/test_tir_usmp_algo.py b/tests/python/unittest/test_tir_usmp_algo.py index 140f6d1b146e..f67148189d8c 100644 --- a/tests/python/unittest/test_tir_usmp_algo.py +++ b/tests/python/unittest/test_tir_usmp_algo.py @@ -299,9 +299,9 @@ class MobilenetStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -311,10 +311,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = 
T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -333,8 +333,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py index d4e62362495c..60360ecade70 100644 --- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py +++ 
b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py @@ -111,9 +111,9 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -123,10 +123,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = 
T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -145,8 +145,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): @@ -215,10 +215,10 @@ class ParallelSerialMixedForLoops: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, 
offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): @@ -256,10 +256,10 @@ class AllSerialForLoops: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = 
T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): @@ -338,8 +338,8 @@ class InceptionStructure: def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d", "tir.noalias": True}) - placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_1 = T.match_buffer(placeholder, [602112], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + tensor_1 = T.match_buffer(tensor, [249], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused in T.serial(0, 28): for ax2 in T.serial(0, 28): @@ -352,9 +352,9 @@ def tvmgen_default_fused_nn_max_pool2d(placeholder: T.handle, tensor: T.handle) def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, 
ax3_inner_1 in T.grid(224, 3): @@ -364,8 +364,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast", "tir.noalias": True}) - placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_7 = T.match_buffer(placeholder_6, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_1 = T.match_buffer(T_cast, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_2 in T.serial(0, 28): for ax2_2, ax3_outer_1, ax3_inner_2 in T.grid(28, 12, 16): @@ -375,11 +375,11 @@ def tvmgen_default_fused_cast(placeholder_6: T.handle, T_cast: T.handle) -> None def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.handle, placeholder_10: T.handle, placeholder_11: T.handle, T_concat: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_concatenate", "tir.noalias": True}) - placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_12 = T.match_buffer(placeholder_8, [50176], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_concat_1 = T.match_buffer(T_concat, [313], dtype="uint8", elem_offset=0, 
align=64, offset_factor=1) + placeholder_13 = T.match_buffer(placeholder_9, [100352], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_14 = T.match_buffer(placeholder_11, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_15 = T.match_buffer(placeholder_10, [25088], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_3 in T.serial(0, 28): for ax2_3, ax3 in T.grid(28, 256): @@ -389,10 +389,10 @@ def tvmgen_default_fused_concatenate(placeholder_8: T.handle, placeholder_9: T.h def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle, T_cast_2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", "tir.noalias": True}) - placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_19 = T.match_buffer(placeholder_16, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_20 = T.match_buffer(placeholder_17, [4096], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput = T.allocate([200704], "int16", "global") for i0_i1_fused in T.serial(0, 56): @@ -411,10 +411,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place def 
tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle, T_cast_4: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", "tir.noalias": True}) - placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_25 = T.match_buffer(placeholder_22, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_26 = T.match_buffer(placeholder_23, [18432], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_1 = T.allocate([150528], "int16", "global") for i0_i1_fused_1 in T.serial(0, 28): @@ -432,8 +432,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = 
T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): @@ -450,10 +450,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", "tir.noalias": True}) - placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_34 = T.match_buffer(placeholder_31, [12288], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_2 = T.allocate([150528], "int16", "global") for i0_i1_fused_2 in T.serial(0, 28): @@ -472,8 +472,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placehol def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_10: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast_1", "tir.noalias": True}) - placeholder_37 = T.match_buffer(placeholder_36, [150528], 
dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_3 = T.allocate([150528], "uint8", "global") for ax0_ax1_fused_6 in T.serial(0, 28): @@ -490,10 +490,10 @@ def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_1 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2(placeholder_38: T.handle, placeholder_39: T.handle, placeholder_40: T.handle, T_cast_12: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", "tir.noalias": True}) - placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_41 = T.match_buffer(placeholder_38, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_42 = T.match_buffer(placeholder_39, [6144], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_3 = T.allocate([150528], "int16", "global") for i0_i1_fused_3 in 
T.serial(0, 28): @@ -511,10 +511,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_44: T.handle, placeholder_45: T.handle, placeholder_46: T.handle, T_cast_14: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True}) - placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_47 = T.match_buffer(placeholder_44, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_48 = T.match_buffer(placeholder_45, [3072], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_4 = T.allocate([150528], "int16", "global") for i0_i1_fused_4 in T.serial(0, 28): @@ -532,10 +532,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(pla def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1(placeholder_50: T.handle, placeholder_51: T.handle, placeholder_52: T.handle, T_cast_16: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", "tir.noalias": True}) - 
placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_53 = T.match_buffer(placeholder_50, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_54 = T.match_buffer(placeholder_51, [4608], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_5 = T.allocate([14400], "int16", "global") for i0_i1_fused_5 in T.serial(0, 30): @@ -553,10 +553,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_(placeholder_56: T.handle, placeholder_57: T.handle, placeholder_58: T.handle, T_cast_18: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", "tir.noalias": True}) - placeholder_59 = T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_59 = 
T.match_buffer(placeholder_56, [75264], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_60 = T.match_buffer(placeholder_57, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_6 = T.allocate([86400], "int16", "global") for i0_i1_fused_6 in T.serial(0, 30): @@ -576,10 +576,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "T.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -598,10 +598,10 @@ def 
tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placeholder_68: T.handle, placeholder_69: T.handle, placeholder_70: T.handle, T_cast_22: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", "tir.noalias": True}) - placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_71 = T.match_buffer(placeholder_68, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_72 = T.match_buffer(placeholder_69, [110592], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_8 = T.allocate([215296], "int16", "global") for i0_i1_fused_8 in T.serial(0, 58): diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index 0a3e39b52f46..e6d123118757 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -74,12 +74,12 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr 
dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_4, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_5, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_subtract_1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -89,14 +89,14 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, 
offset_factor=1) - T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_65, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_66, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_67, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -115,10 +115,10 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": 
"tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(placeholder_29, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py index d72cb7f72ede..53a381c82b14 100644 --- a/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py +++ b/tests/python/unittest/test_tir_usmp_transform_create_io_allocates.py @@ -28,9 +28,9 @@ class SingleInputSingleOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = 
T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -40,8 +40,8 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input: T.handle, output: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output_buffer_var.data, dtype="int32")) # fmt: on @@ -54,9 +54,9 @@ class TwoInputSingleOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, 
offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -66,9 +66,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input1: T.handle, input2: T.handle, output: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + input2_buffer_var = T.match_buffer(input2, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output_buffer_var = T.match_buffer(output, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, input2_buffer_var.data, output_buffer_var.data, dtype="int32")) # fmt: on @@ -81,9 +81,9 @@ class TwoInputTwoOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", 
elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -93,10 +93,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input1: T.handle, input2: T.handle, output1: T.handle, output2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input1_buffer_var = T.match_buffer(input1, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + input2_buffer_var = T.match_buffer(input2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input1_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input2_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32")) @@ 
-110,9 +110,9 @@ class SingleInputTwoOutput: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -122,9 +122,9 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def __tvm_main__(input: T.handle, output1: T.handle, output2: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "__tvm_main__", "runner_function": True}) - input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) - output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=128, offset_factor=1) + input_buffer_var = T.match_buffer(input, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + output1_buffer_var = T.match_buffer(output1, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) + output2_buffer_var = T.match_buffer(output2, [452], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body 
T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p0", dtype="handle"), output1_buffer_var.data, dtype="int32")) T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input_buffer_var.data, T.lookup_param("p1", dtype="handle"), output2_buffer_var.data, dtype="int32")) diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py index 6e53bcb5e597..155ff0962def 100644 --- a/tests/python/unittest/test_tir_usmp_utils.py +++ b/tests/python/unittest/test_tir_usmp_utils.py @@ -31,9 +31,9 @@ class LinearStructure: def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True}) - placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=128, offset_factor=1) - T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_4 = T.match_buffer(placeholder_2, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + placeholder_5 = T.match_buffer(placeholder_3, [1], dtype="int16", elem_offset=0, align=64, offset_factor=1) + T_subtract_1 = T.match_buffer(T_subtract, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body for ax0_ax1_fused_1 in T.serial(0, 224): for ax2_1, ax3_inner_1 in T.grid(224, 3): @@ -43,10 +43,10 @@ def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": 
"tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True}) - placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=128, offset_factor=1) - placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=128, offset_factor=1) - T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=128, offset_factor=1) + placeholder_65 = T.match_buffer(placeholder_62, [150528], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_66 = T.match_buffer(placeholder_63, [9408], dtype="int16", elem_offset=0, align=64, offset_factor=1) + placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) + T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body PaddedInput_7 = T.allocate([157323], "int16", "global") for i0_i1_fused_7 in T.serial(0, 229): @@ -65,8 +65,8 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global") for ax0_ax1_fused_4 in T.serial(0, 56): diff --git 
a/tests/python/unittest/test_tvmscript_complete.py b/tests/python/unittest/test_tvmscript_complete.py index c4b4afb24f82..29ac5dc5da0d 100644 --- a/tests/python/unittest/test_tvmscript_complete.py +++ b/tests/python/unittest/test_tvmscript_complete.py @@ -201,12 +201,12 @@ def func_with_bufferslice_indices(data: T.handle, index: T.handle) -> None: @T.prim_func def expected_bufferslice_indices(data: T.handle, index: T.handle) -> None: - index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) - data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) + index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1) + data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) + out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(16, 16): with T.block(): vi, vj = T.axis.remap("SS", [i0, i1]) @@ -229,12 +229,12 @@ def func_with_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> @T.prim_func def expected_recursive_bufferslice_indices(data: T.handle, index: T.handle) -> None: - index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=128, offset_factor=1) - data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=128, offset_factor=1) + index_buf = T.match_buffer(index, [1], dtype="int32", elem_offset=0, align=64, offset_factor=1) + data_buf = T.match_buffer(data, [16, 16], elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=128, offset_factor=1) + out_buf = T.alloc_buffer([16, 16], elem_offset=0, align=64, offset_factor=1) for i0, i1 in T.grid(16, 16): with T.block(): vi, vj = T.axis.remap("SS", [i0, i1]) @@ -303,12 +303,12 @@ def 
alloc_buffer_func(a: T.handle, b: T.handle) -> None: @T.prim_func def expect_alloc_buffer_func(a: T.handle, b: T.handle) -> None: - A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) - B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + A = T.match_buffer(a, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) + B = T.match_buffer(b, [2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) with T.block("root"): T.reads([]) T.writes([]) - C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=128, offset_factor=1) + C = T.alloc_buffer([2, 2], dtype="float32", elem_offset=0, align=64, offset_factor=1) A[(0, 0)] = T.float32(2) C[(0, 0)] = A[(0, 0)] + B[(0, 0)] B[(0, 0)] = C[(0, 0)] diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index e5f5ae752aac..e98f5057d8c4 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -34,11 +34,11 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "mmult", "tir.noalias": True}) # buffer definition - C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=128, offset_factor=1) - packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=128, offset_factor=1) - A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=128, offset_factor=1) + C_global = T.buffer_decl([1024, 1024], elem_offset=0, align=64, offset_factor=1) + packedB = T.buffer_decl([32, 1024, 32], elem_offset=0, align=64, offset_factor=1) + A_1 = T.match_buffer(A, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + 
C_1 = T.match_buffer(C, [1024, 1024], elem_offset=0, align=64, offset_factor=1) # body T.realize(packedB[0:32, 0:1024, 0:32], "") for x in T.parallel(0, 32): @@ -90,9 +90,9 @@ class Module: def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "mmult", "tir.noalias": True}) - A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=128, offset_factor=1) - B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=128, offset_factor=1) - C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=128, offset_factor=1) + A_1 = T.match_buffer(A, [1024 * 1024], elem_offset=0, align=64, offset_factor=1) + B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) + C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1) # body packedB = T.allocate([32768], "float32", "global") for x in T.parallel(0, 32): @@ -484,10 +484,10 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: tz = T.env_thread("threadIdx.z") # buffer definition Apad_shared = T.buffer_decl( - [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) Apad_shared_wmma_matrix_a = T.buffer_decl( - [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [16, 16, 16, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) BA = T.buffer_decl( [16, 16], dtype="float16", scope="wmma.matrix_a", align=32, offset_factor=256 @@ -497,13 +497,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: ) BC = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) Conv_wmma_accumulator = T.buffer_decl( - [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 + [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1 ) W_shared = T.buffer_decl( - [3, 3, 16, 32, 16, 16], dtype="float16", 
elem_offset=0, align=128, offset_factor=1 + [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) W_shared_wmma_matrix_b = T.buffer_decl( - [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) buffer = T.buffer_decl( [16, 16], dtype="float16", scope="shared", align=32, offset_factor=256 @@ -520,13 +520,13 @@ def func(A: T.handle, W: T.handle, Conv: T.handle) -> None: buffer_4 = T.buffer_decl([16, 16], scope="wmma.accumulator", align=32, offset_factor=256) buffer_5 = T.buffer_decl([16, 16], align=32, offset_factor=256) A_1 = T.match_buffer( - A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + A, [16, 14, 14, 16, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) W_1 = T.match_buffer( - W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=128, offset_factor=1 + W, [3, 3, 16, 32, 16, 16], dtype="float16", elem_offset=0, align=64, offset_factor=1 ) Conv_1 = T.match_buffer( - Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=128, offset_factor=1 + Conv, [16, 14, 14, 32, 16, 16], elem_offset=0, align=64, offset_factor=1 ) # body T.realize(Conv_1[0:16, 0:14, 0:14, 0:32, 0:16, 0:16], "") @@ -2958,8 +2958,8 @@ def primfunc_with_allocate_annotations(): def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.handle) -> None: # function attr dict T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True}) - placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=128, offset_factor=1) - T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=128, offset_factor=1) + placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) + T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", 
elem_offset=0, align=64, offset_factor=1) # body tensor_2 = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"}) for ax0_ax1_fused_4 in T.serial(0, 56): diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py index 329a397724f3..d955ec0a8c80 100644 --- a/tests/python/unittest/test_tvmscript_syntax_sugar.py +++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py @@ -288,9 +288,9 @@ def shared_16x16_to_ldmatrix_32x8_layout(i, j): @T.prim_func def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp") with T.block("root"): T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) @@ -315,9 +315,9 @@ def mma_sync_m16n16k16_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def mma_sync_m16n16k16_desc_manual(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - B = T.match_buffer(b, (32, 8), "float16", align=128, offset_factor=16, scope="warp") - C = T.match_buffer(c, (32, 8), "float16", align=128, offset_factor=16, scope="warp") + A = T.match_buffer(a, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + B = T.match_buffer(b, (32, 8), "float16", align=64, offset_factor=16, scope="warp") + C = T.match_buffer(c, (32, 8), "float16", align=64, offset_factor=16, scope="warp") with T.block("root"): T.reads(C[0:32, 0:8], A[0:32, 0:8], B[0:32, 0:8]) diff 
--git a/tests/python/unittest/test_tvmscript_type.py b/tests/python/unittest/test_tvmscript_type.py index 12954e31e5ec..8228363a95ac 100644 --- a/tests/python/unittest/test_tvmscript_type.py +++ b/tests/python/unittest/test_tvmscript_type.py @@ -25,13 +25,13 @@ @T.prim_func def element_wise_storage_align(a: T.handle, c: T.handle) -> None: - C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) - A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) + C = T.match_buffer(c, [128, 128], elem_offset=0, align=64, offset_factor=1) + A = T.match_buffer(a, [128, 128], elem_offset=0, align=64, offset_factor=1) # body with T.block("root"): T.reads([]) T.writes([]) - B = T.alloc_buffer([128, 128], elem_offset=0, align=128, offset_factor=1) + B = T.alloc_buffer([128, 128], elem_offset=0, align=64, offset_factor=1) for i0 in T.serial(0, 128): for ax1 in T.serial(0, 128): with T.block("B"): From 5db38ba8993d30ab0a89c82ff69e582d1bcc1678 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 25 Aug 2022 21:06:54 +0100 Subject: [PATCH 050/704] [COMMUNITY] @cconvey -> Reviewer (#12598) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index e3b4fe339a4f..1f9808ff2510 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -97,6 +97,7 @@ We do encourage everyone to work anything they are interested in. 
- [Zhi Chen](https://github.com/zhiics): @zhiics - [Valery Chernov](https://github.com/vvchernov): @vvchernov - [Neo Chien](https://github.com/cchung100m): @cchung100m +- [Christian Convey](https://github.com/cconvey/): @cconvey - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg - [Balint Cristian](https://github.com/cbalint13): @cbalint13 - [Egor Churaev](https://github.com/echuraev): @echuraev - metal From a9f7c32e42a5f09e641dbe83f81cc4a73869af12 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Thu, 25 Aug 2022 14:21:13 -0700 Subject: [PATCH 051/704] [skip ci][Community] Wuwei Lin -> PMC (#12605) [Community] Wuwei Lin -> PMC --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1f9808ff2510..771eb1c63eda 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -49,7 +49,7 @@ We do encourage everyone to work anything they are interested in. - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 - tir, tvm-script -- [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi +- [Wuwei Lin](https://github.com/vinx13) (PMC): @vinx13 - relay, topi, tir, meta_schedule - [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Hao Lu](https://github.com/hlu1): @hlu1 - nnpack, frontends - [Eric Lunderberg](https://github.com/Lunderberg): @Lunderberg - CI, Vulkan backend From 3224817d0835909c2673184a6c20bac3b7672632 Mon Sep 17 00:00:00 2001 From: WANG Zihan Date: Fri, 26 Aug 2022 14:19:19 +0800 Subject: [PATCH 052/704] [TOPI][Bugfix] Make semantics of empty `axis` in `squeeze` consistent with Relay (#12596) * Fix empty axis of `squeeze` in TOPI. * Add test case for `squeeze` with empty `axis`. * Add LLVM target for `test_squeeze`. 
--- include/tvm/topi/transform.h | 4 ++-- tests/python/topi/python/test_topi_transform.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 81935dd72dda..7accbf86912d 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -396,7 +396,7 @@ inline Tensor unravel_index(const Tensor& x, const Tensor& shape, std::string na * The removed dimensions must have a constant size of 1. * * \param x The input tensor - * \param axis Indices of the dimensions to remove. If this is empty, + * \param axis Indices of the dimensions to remove. If this is None, * all entries with a constant size of 1 will be removed. * \param atleast1d Whether the output need to be atleast1d. * \param name The name of the operation @@ -408,7 +408,7 @@ inline Tensor squeeze(const Tensor& x, Array axis, bool atleast1d = fal std::string name = "T_squeeze", std::string tag = kInjective) { auto ndim = x->shape.size(); std::vector axis_val; - if (!axis.defined() || axis.size() == 0) { + if (!axis.defined()) { for (size_t i = 0; i < ndim; ++i) { if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) { axis_val.push_back(static_cast(i)); diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index c3155c948a8d..dd5ad1b11926 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -940,18 +940,19 @@ def test_where(): verify_where((1, 2, 3, 4)) -@tvm.testing.requires_gpu +@tvm.testing.uses_gpu def test_squeeze(): verify_squeeze((1, 2, 3, 4), 0) verify_squeeze((1, 2, 1, 4), None) verify_squeeze((1, 1, 1, 4), (1, 2)) verify_squeeze((1, 1, 1, 1), None) + verify_squeeze((1, 1, 1, 1), ()) # a special case to trigger inline let expression A = te.placeholder((2,), "float32", "A") E = topi.squeeze(A) C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype("int32")]) - for target in 
["cuda", "opencl"]: + for target in ["llvm", "cuda", "opencl"]: dev = tvm.device(target, 0) if tvm.testing.device_enabled(target): with tvm.target.Target(target): From 4f431c87c2b8bb5ea0773c44d92658e506251dda Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Fri, 26 Aug 2022 02:30:38 -0700 Subject: [PATCH 053/704] [TIR] Expose Memory Copy-Related PTX Builtins (#12611) * Expose Memory Copy-Related PTX Builtins This PR exposes the following TIR operation in python: `ptx_ldmatrix`: tested `ptx_cp_async`: tested `ptx_commit_group`: tested `ptx_wait_group`: tested Co-authored-by: yongwww * apply code review suggestion Co-authored-by: yongwww --- python/tvm/tir/__init__.py | 1 + python/tvm/tir/op.py | 111 +++++++++++++++++++++ tests/python/unittest/test_tir_op_types.py | 54 +++++----- 3 files changed, 140 insertions(+), 26 deletions(-) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 04ab7f80daa9..4a6f32d03a2b 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -59,6 +59,7 @@ tvm_bmma_sync, tvm_fill_fragment, ) +from .op import ptx_ldmatrix, ptx_cp_async, ptx_commit_group, ptx_wait_group from .op import vectorlow, vectorhigh, vectorcombine from .op import infinity, reinterpret from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp, clz diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index cf7985e8f489..e510f68a68a1 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -831,6 +831,117 @@ def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout): ) +def ptx_ldmatrix(dtype, trans, num, type, local_ptr, local_offset, smem_ptr, smem_offset): + """TVM intrinsic for ptx load matrix from shared memory + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix + + Parameters + ---------- + dtype : str + The data type of the result. + + trans : bool + The matrix is loaded in column-major format. 
+ + num : IntImm + The number of matrices. + + type : Literal[".b16"] + The data type of the matrices. + + local_ptr : Var + The local pointer variable. + + local_offset : Expr + The offset of local pointer. + + smem_ptr : Var + The shared memory pointer variable. + + smem_offset : Expr + The offset of shared memory pointer. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + dtype, + "tir.ptx_ldmatrix", + trans, + num, + type, + local_ptr, + local_offset, + smem_ptr, + smem_offset, + ) + + +def ptx_cp_async(dtype, shared_ptr, shared_offset, global_ptr, global_offset, bytes): + """TVM intrinsic for ptx async copy from global to shared memory + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async + + Parameters + ---------- + dtype : str + The data type of the result. + + shared_ptr : Var + The shared memory pointer variable. + + shared_offset : Expr + The offset of shared memory pointer. + + global_ptr : Var + The global memory pointer variable. + + global_offset : Expr + The offset of global memory pointer. + + bytes : int + The data size to copy. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + dtype, "tir.ptx_cp_async", shared_ptr, shared_offset, global_ptr, global_offset, bytes + ) + + +def ptx_commit_group(): + """TVM intrinsic for ptx async copy commit + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin("", "tir.ptx_commit_group") + + +def ptx_wait_group(num): + """TVM intrinsic for ptx async copy wait + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-wait-group + + Parameters + ---------- + num : int + The number of the most recent uncommitted pending cp.async groups to wait.
+ + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin("", "tir.ptx_wait_group", num) + + def vectorlow(dtype, vec): """Get the low level half of the vector diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py index 5254e7326e24..f8e8de074c42 100644 --- a/tests/python/unittest/test_tir_op_types.py +++ b/tests/python/unittest/test_tir_op_types.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=missing-docstring import tvm +import tvm.testing from tvm import tir @@ -142,6 +143,32 @@ def test_tir_op_tvm_fill_fragment(): assert expr.op.name == "tir.tvm_fill_fragment" +def test_op_ptx_ldmatrix(): + buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared") + buffer_local = tir.decl_buffer([8], "float16", scope="local") + expr = tir.ptx_ldmatrix( + "float16", False, 4, ".b16", buffer_local.data, 0, buffer_shared.data, 0 + ) + assert expr.op.name == "tir.ptx_ldmatrix" + + +def test_op_ptx_cp_async(): + buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared") + buffer_local = tir.decl_buffer([8], "float16", scope="local") + expr = tir.ptx_cp_async("float16", buffer_shared.data, 0, buffer_local.data, 0, 16) + assert expr.op.name == "tir.ptx_cp_async" + + +def test_op_ptx_commit_group(): + expr = tir.ptx_commit_group() + assert expr.op.name == "tir.ptx_commit_group" + + +def test_op_ptx_wait_group(): + expr = tir.ptx_wait_group(8) + assert expr.op.name == "tir.ptx_wait_group" + + def test_tir_op_vectorlow(): buffer = tir.decl_buffer((4, 4), "int8", offset_factor=1) vec = buffer.vload([0, 0], dtype="int8x16") @@ -189,29 +216,4 @@ def test_tir_op_TVMBackendFreeWorkspace(): if __name__ == "__main__": - test_tir_op_tvm_tuple() - test_tir_op_tvm_struct_get() - test_tir_op_tvm_struct_set() - test_tir_op_address_of() - test_tir_op_lookup_param() - test_tir_op_reinterpret() - test_tir_op_isnullptr() - test_tir_op_call_assume() - test_tir_op_call_undef() - 
test_tir_op_call_likely() - test_tir_op_tvm_thread_allreduce() - test_tir_op_type_annotation() - test_tir_op_tvm_access_ptr() - test_tir_op_tvm_throw_last_error() - test_tir_op_tvm_load_matrix_sync(), - test_tir_op_tvm_store_matrix_sync(), - test_tir_op_tvm_mma_sync(), - test_tir_op_tvm_bmma_sync(), - test_tir_op_tvm_fill_fragment(), - test_tir_op_vectorlow() - test_tir_op_vectorhigh() - test_tir_op_vectorcombine() - test_tir_op_shift_left() - test_tir_op_shift_right() - test_tir_op_TVMBackendAllocWorkspace() - test_tir_op_TVMBackendFreeWorkspace() + tvm.testing.main() From e02f2f9fddd8cd38589e3569c41de9f7af39971c Mon Sep 17 00:00:00 2001 From: "yin.changsheng" Date: Fri, 26 Aug 2022 19:42:57 +0800 Subject: [PATCH 054/704] [TIR][Schedule] enhance compute_at and reverse_compute_at primitive to choose possible position (#12450) The current TIR "compute_at" primitive computes a block at its closest consumers. When a block has multiple producers, whichever producer is computed at later ends up behind the ones computed at earlier. However, for some special hardware, we usually want to keep a certain order regardless of whether a block is computed at earlier or later. E.g.: block A and block B are producers of block C. Block A is computed at block C first and block B is computed at block C later. We want the result to be block B->block A->block C under some loop var.
--- include/tvm/tir/schedule/schedule.h | 14 +- python/tvm/tir/schedule/schedule.py | 16 ++ src/tir/schedule/concrete_schedule.cc | 8 +- src/tir/schedule/concrete_schedule.h | 7 +- src/tir/schedule/primitive.h | 13 +- src/tir/schedule/primitive/compute_at.cc | 67 +++++--- src/tir/schedule/traced_schedule.cc | 19 +-- src/tir/schedule/traced_schedule.h | 7 +- ...le_schedule_rule_cross_thread_reduction.py | 16 +- ...hedule_schedule_rule_multi_level_tiling.py | 86 +++++----- ...e_schedule_rule_random_compute_location.py | 2 +- .../unittest/test_tir_schedule_compute_at.py | 152 ++++++++++++++++++ 12 files changed, 308 insertions(+), 99 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 11fec642c718..da399ab976d6 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -432,9 +432,13 @@ class ScheduleNode : public runtime::Object { * \param block_rv The block to be moved * \param loop_rv The loop where the block to be moved under * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1 + * \param index The block index of the loop body subtree blocks: + * - `index = -1` means inserted into the last possible insertion point; + * - `index = -2` means inserted into the first possible insertion point; + * - Otherwise, `index` is a nonnegative number that indicates the insertion point */ - virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) = 0; + virtual void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, + int index = -1) = 0; /*! 
* \brief Move a consumer block under the specific loop, and regenerate the * loops induced by the block so that the buffer region consumed by the consumer block could @@ -449,9 +453,13 @@ class ScheduleNode : public runtime::Object { * \param block_rv The block to be moved * \param loop_rv The loop where the block to be moved under * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1 + * \param index The block index of the loop body subtree blocks: + * - `index = -1` means inserted into the last possible insertion point; + * - `index = -2` means inserted into the first possible insertion point; + * - Otherwise, `index` is a nonnegative number that indicates the insertion point */ virtual void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) = 0; + bool preserve_unit_loops, int index = -1) = 0; /*! * \brief Inline a block into its consumer(s). It requires: * 1) The block is a complete non-root block, which only produces one buffer diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index e18bee35a5e1..04cc1bc26ad1 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -1274,6 +1274,7 @@ def compute_at( block: Union[BlockRV, str], loop: LoopRV, preserve_unit_loops: bool = False, + index: int = -1, ) -> None: """Compute-At. 
Move a producer block under the specific loop, and regenerate the loops induced by the block so that the buffer region produced by the producer block could @@ -1303,6 +1304,12 @@ def compute_at( preserve_unit_loops: bool Whether to keep the trivial loops whose extents are 1 + index: int + The block index of the loop body subtree blocks: + - `index = -1` means inserted into the last possible insertion point; + - `index = -2` means inserted into the first possible insertion point; + - Otherwise, `index` is a nonnegative number that indicates the insertion point + Examples -------- @@ -1360,6 +1367,7 @@ def after_compute_at(a: T.handle, c: T.handle) -> None: block, loop, preserve_unit_loops, + index, ) @type_checked @@ -1368,6 +1376,7 @@ def reverse_compute_at( block: Union[BlockRV, str], loop: LoopRV, preserve_unit_loops: bool = False, + index: int = -1, ) -> None: """Reverse-Compute-At. Move a consumer block under the specific loop, and regenerate the loops induced by the block so that the buffer region consumed by the consumer block could @@ -1394,6 +1403,12 @@ def reverse_compute_at( preserve_unit_loops: bool Whether to keep the trivial loops whose extents are 1 + index: int + The block index of the loop body subtree blocks: + - `index = -1` means inserted into the last possible insertion point; + - `index = -2` means inserted into the first possible insertion point; + - Otherwise, `index` is a nonnegative number that indicates the insertion point + Examples -------- @@ -1451,6 +1466,7 @@ def after_reverse_compute_at(a: T.handle, c: T.handle) -> None: block, loop, preserve_unit_loops, + index, ) @type_checked diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index c16638f748b4..5f773a02d6ff 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -574,7 +574,7 @@ BlockRV ConcreteScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index, /******** Schedule: Compute location 
********/ void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) { + bool preserve_unit_loops, int index) { static StmtSRef inline_mark = StmtSRef::InlineMark(); static StmtSRef root_mark = StmtSRef::RootMark(); StmtSRef loop_sref = this->GetSRef(loop_rv); @@ -586,14 +586,14 @@ void ConcreteScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_); } else { TVM_TIR_SCHEDULE_BEGIN(); - tir::ComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops); + tir::ComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops, index); TVM_TIR_SCHEDULE_END("compute-at", this->error_render_level_); } this->state_->DebugVerify(); } void ConcreteScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) { + bool preserve_unit_loops, int index) { static StmtSRef inline_mark = StmtSRef::InlineMark(); static StmtSRef root_mark = StmtSRef::RootMark(); StmtSRef loop_sref = this->GetSRef(loop_rv); @@ -605,7 +605,7 @@ void ConcreteScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopR TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_); } else { TVM_TIR_SCHEDULE_BEGIN(); - tir::ReverseComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops); + tir::ReverseComputeAt(state_, this->GetSRef(block_rv), loop_sref, preserve_unit_loops, index); TVM_TIR_SCHEDULE_END("reverse-compute-at", this->error_render_level_); } this->state_->DebugVerify(); diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index cdd0a5b7b0a2..92b9de408873 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -119,9 +119,10 @@ class ConcreteScheduleNode : public ScheduleNode { BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type) override; /******** 
Schedule: Compute location ********/ - void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) override; - void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) override; + void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, + int index = -1) override; + void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, + int index = -1) override; void ComputeInline(const BlockRV& block) override; void ReverseComputeInline(const BlockRV& block) override; /******** Schedule: Reduction ********/ diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 14203a0d167e..05d9e4cf944a 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -299,10 +299,13 @@ TVM_DLL StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buf * \param self The schedule state * \param block_sref The block to be moved * \param loop_sref The loop where the block to be moved to - * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1 + * \param index The block index of the loop body subtree blocks: + * - `index = -1` means inserted into the last possible insertion point; + * - `index = -2` means inserted into the first possible insertion point; + * - Otherwise, `index` is a nonnegative number that indicates the insertion point */ TVM_DLL void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, - bool preserve_unit_loops); + bool preserve_unit_loops, int index = -1); /*! 
* \brief Move a consumer block under the specific loop, and regenerate the * loops induced by the block so that the buffer region consumed by the consumer block could @@ -318,9 +321,13 @@ TVM_DLL void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const Stm * \param block_sref The block to be moved * \param loop_sref The loop where the block to be moved to * \param preserve_unit_loops Whether to keep the trivial loops whose extents are 1 + * \param index The block index of the loop body subtree blocks: + * - `index = -1` means inserted into the last possible insertion point; + * - `index = -2` means inserted into the first possible insertion point; + * - Otherwise, `index` is a nonnegative number that indicates the insertion point */ TVM_DLL void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref, - const StmtSRef& loop_sref, bool preserve_unit_loops); + const StmtSRef& loop_sref, bool preserve_unit_loops, int index = -1); /*! * \brief Inline a block into its consumer(s). 
It requires: * 1) The block is a complete non-root block, which only produces one buffer diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 98a6b2400ee3..8baedfd70dd0 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -129,15 +129,19 @@ class NotInSameScopeError : public ScheduleError { * \param producer_srefs The producer blocks * \param consumer_srefs The consumer blocks * \param block2realize A cache that maps a block to its realize - * \return The last position the new block can be inserted onto, and the + * \param index The block index of the loop body subtree blocks: + * - `index = -1` means inserted into the last possible insertion point; + * - `index = -2` means inserted into the first possible insertion point; + * - Otherwise, `index` is a nonnegative number that indicates the insertion point + * \return The possible position the new block can be inserted into, and the * producer-consumer-relationship is still satisfied. * \throws ScheduleError if there is no such insertion point found */ template -int FindInsertionPoint( - const ScheduleState& self, const Array& subtrees, const Array& producer_srefs, - const Array& consumer_srefs, - std::unordered_map* block2realize) { +int FindInsertionPoint(const ScheduleState& self, const Array& subtrees, + const Array& producer_srefs, const Array& consumer_srefs, + std::unordered_map* block2realize, + int index) { ProducerConsumerSplit split = ProducerConsumerSplit::Find(self, subtrees, producer_srefs, consumer_srefs, block2realize); // Step 1. Check if all the producers are visited in the subtrees, if required to @@ -159,8 +163,22 @@ int FindInsertionPoint( // Step 3. Check if there is at least one index of the position can be inserted into // The valid indices are: (last_producer_position, first_consumer_position] ICHECK(split.last_producer_position < split.first_consumer_position); - // Step 4. 
Return the last valid insertion point - return split.first_consumer_position; + // Step 4. Return the possible insertion point according to index + int insert_position; + if (index == -1) { + insert_position = split.first_consumer_position; + } else if (index == -2) { + insert_position = split.last_producer_position + 1; + } else if (index >= 0 && index >= split.last_producer_position + 1 && + index <= split.first_consumer_position) { + insert_position = index; + } else { + LOG(FATAL) << "Valid index:(-1, -2, [" << split.last_producer_position + 1 << ", " + << split.first_consumer_position << "]), " + << "current index=" << index; + throw; + } + return insert_position; } /*! @@ -556,7 +574,8 @@ void CalculateProvidedRequiredRegions( template void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, bool preserve_unit_loops, - arith::Analyzer* analyzer, bool check_only = false) { + arith::Analyzer* analyzer, bool check_only = false, + int index = -1) { const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); // Step 1. Bunch of checks @@ -588,7 +607,8 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s /*self=*/self, /*subtrees=*/AsArray(loop->body), /*producer_srefs=*/producer_srefs, - /*consumer_srefs=*/consumer_srefs, /*block2realize=*/&block2realize); + /*consumer_srefs=*/consumer_srefs, /*block2realize=*/&block2realize, + /*index=*/index); // Step 4. Calculate the region provided by a single execution instance of `block`, // as well as the region required by dependent blocks under `loop`. 
// Here is the definition of `provide` and `require`: @@ -626,17 +646,17 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s } void ComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, - bool preserve_unit_loops) { + bool preserve_unit_loops, int index) { arith::Analyzer analyzer; - ComputeAtOrReverseComputeAtImpl(self, block_sref, loop_sref, preserve_unit_loops, - &analyzer); + ComputeAtOrReverseComputeAtImpl(self, block_sref, loop_sref, preserve_unit_loops, &analyzer, + false, index); } void ReverseComputeAt(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref, - bool preserve_unit_loops) { + bool preserve_unit_loops, int index) { arith::Analyzer analyzer; ComputeAtOrReverseComputeAtImpl(self, block_sref, loop_sref, preserve_unit_loops, - &analyzer); + &analyzer, false, index); } bool CanComputeAt(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& loop_sref, @@ -671,20 +691,21 @@ struct ComputeAtTraits : public UnpackedInstTraits { private: static constexpr size_t kNumInputs = 2; - static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumAttrs = 2; static constexpr size_t kNumDecisions = 0; static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, LoopRV loop_rv, - Bool preserve_unit_loops) { - return sch->ComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool()); + Bool preserve_unit_loops, IntImm index) { + return sch->ComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool(), index->value); } static String UnpackedAsPython(Array outputs, String block_rv, String loop_rv, - Bool preserve_unit_loops) { + Bool preserve_unit_loops, IntImm index) { PythonAPICall py("compute_at"); py.Input("block", block_rv); py.Input("loop", loop_rv); py.Input("preserve_unit_loops", preserve_unit_loops.operator bool()); + py.Input("index", index); return py.Str(); } @@ -698,20 +719,22 @@ struct ReverseComputeAtTraits : public 
UnpackedInstTraitsReverseComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool()); + Bool preserve_unit_loops, IntImm index) { + return sch->ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops.operator bool(), + index->value); } static String UnpackedAsPython(Array outputs, String block_rv, String loop_rv, - Bool preserve_unit_loops) { + Bool preserve_unit_loops, IntImm index) { PythonAPICall py("reverse_compute_at"); py.Input("block", block_rv); py.Input("loop", loop_rv); py.Input("preserve_unit_loops", preserve_unit_loops.operator bool()); + py.Input("index", index); return py.Str(); } diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 07d4da54d7fb..04ddc0507dc4 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -322,24 +322,25 @@ BlockRV TracedScheduleNode::ReIndex(const BlockRV& block_rv, int buffer_index, /******** Schedule: Compute location ********/ void TracedScheduleNode::ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) { - ConcreteScheduleNode::ComputeAt(block_rv, loop_rv, preserve_unit_loops); + bool preserve_unit_loops, int index) { + ConcreteScheduleNode::ComputeAt(block_rv, loop_rv, preserve_unit_loops, index); static const InstructionKind& kind = InstructionKind::Get("ComputeAt"); - trace_->Append(/*inst=*/Instruction(/*kind=*/kind, - /*inputs=*/{block_rv, loop_rv}, - /*attrs=*/{Integer(preserve_unit_loops)}, - /*outputs=*/{})); + trace_->Append( + /*inst=*/Instruction(/*kind=*/kind, + /*inputs=*/{block_rv, loop_rv}, + /*attrs=*/{Integer(preserve_unit_loops), Integer(index)}, + /*outputs=*/{})); } void TracedScheduleNode::ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) { - ConcreteScheduleNode::ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops); + bool preserve_unit_loops, int index) { + ConcreteScheduleNode::ReverseComputeAt(block_rv, loop_rv, preserve_unit_loops, 
index); static const InstructionKind& kind = InstructionKind::Get("ReverseComputeAt"); trace_->Append(/*inst=*/Instruction(/*kind=*/kind, /*inputs=*/{block_rv, loop_rv}, - /*attrs=*/{Integer(preserve_unit_loops)}, + /*attrs=*/{Integer(preserve_unit_loops), Integer(index)}, /*outputs=*/{})); } diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index 865a21687950..d98e4ba4bb95 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -79,9 +79,10 @@ class TracedScheduleNode : public ConcreteScheduleNode { BlockRV ReIndex(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type) final; /******** Schedule: Compute location ********/ - void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops) final; - void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, - bool preserve_unit_loops) final; + void ComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, + int index = -1) final; + void ReverseComputeAt(const BlockRV& block_rv, const LoopRV& loop_rv, bool preserve_unit_loops, + int index = -1) final; void ComputeInline(const BlockRV& block_rv) final; void ReverseComputeInline(const BlockRV& block_rv) final; /******** Schedule: Reduction ********/ diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py index 5f76e77592e3..592d32d6245d 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py @@ -80,7 +80,7 @@ def test_gpu_softmax_mn(): "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", 'sch.bind(loop=l6, 
thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l7, l8, l9 = sch.get_loops(block=b0)", "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", @@ -93,7 +93,7 @@ def test_gpu_softmax_mn(): "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l7, l8, l9 = sch.get_loops(block=b0)", "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", @@ -107,7 +107,7 @@ def test_gpu_softmax_mn(): "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)", 'sch.bind(loop=l7, thread_axis="threadIdx.x")', - "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True)", + "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")', "l8, l9, l10 = sch.get_loops(block=b1)", "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)", @@ -117,7 +117,7 @@ def test_gpu_softmax_mn(): "v16 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l17, l18 = sch.split(loop=l15, factors=[None, v16], preserve_unit_iters=True)", 'sch.bind(loop=l18, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l14, 
preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l19, l20, l21 = sch.get_loops(block=b0)", "l22, l23 = sch.split(loop=l21, factors=[None, v16], preserve_unit_iters=True)", @@ -157,7 +157,7 @@ def test_gpu_softmax_mn_after_inline(): "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l7, l8, l9 = sch.get_loops(block=b0)", "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", @@ -171,14 +171,14 @@ def test_gpu_softmax_mn_after_inline(): "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)", 'sch.bind(loop=l7, thread_axis="threadIdx.x")', - "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True)", + "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")', "l8, l9, l10 = sch.get_loops(block=b1)", "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)", 'sch.bind(loop=l12, thread_axis="threadIdx.x")', "b13, b14 = sch.get_consumers(block=b0)", "l15, l16, l17, l18 = sch.get_loops(block=b13)", - "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l19, l20, l21 = sch.get_loops(block=b0)", 
"l22, l23 = sch.split(loop=l21, factors=[None, v5], preserve_unit_iters=True)", @@ -206,7 +206,7 @@ def test_gpu_batch_norm_bmn(): "v3 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", "l4, l5 = sch.split(loop=l2, factors=[None, v3], preserve_unit_iters=True)", 'sch.bind(loop=l5, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True, index=-1)", 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', "l6, l7, l8, l9 = sch.get_loops(block=b0)", "l10 = sch.fuse(l8, l9, preserve_unit_iters=True)", diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py index 87159fcb3110..fe1220c50925 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py @@ -62,7 +62,7 @@ def test_cpu_matmul(): "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", 'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")', - "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)", ], [ 'b0 = sch.get_block(name="C", func_name="main")', @@ -76,7 +76,7 @@ def test_cpu_matmul(): "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", 'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")', - "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)", ], [ 'b0 = 
sch.get_block(name="C", func_name="main")', @@ -123,7 +123,7 @@ def test_cpu_matmul_relu(): "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", "b24, = sch.get_consumers(block=b0)", - "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)", ], [ 'b0 = sch.get_block(name="C", func_name="main")', @@ -137,7 +137,7 @@ def test_cpu_matmul_relu(): "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", "b24, = sch.get_consumers(block=b0)", - "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)", ], [ 'b0 = sch.get_block(name="C", func_name="main")', @@ -193,15 +193,15 @@ def test_cuda_matmul(): 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32)', 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)', 'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")', - "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)", 'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")', - "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True)", + "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)", "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)", "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)", "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", 'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)', 'b43 = sch.cache_read(block=b0, read_buffer_index=1, 
storage_scope="shared")', - "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True)", + "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)", "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)", "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)", "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", @@ -247,15 +247,15 @@ def test_cuda_matmul_relu(): "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)", 'sch.bind(loop=l32, thread_axis="threadIdx.x")', 'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")', - "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True)", + "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)", 'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")', - "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True)", + "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)", "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)", "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)", "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", 'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)', 'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")', - "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True)", + "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)", "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)", "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)", "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", @@ -402,7 +402,7 @@ def test_multi_level_tiling_conv2d_nchwc_vnni(): l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True) sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, 
l61, l69, l77) b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global") -sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True)""".split( +sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True, index=-1)""".split( "\n" ), """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") @@ -437,7 +437,7 @@ def test_multi_level_tiling_conv2d_nchwc_vnni(): l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True) sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77) b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global") -sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True)""".split( +sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True, index=-1)""".split( "\n" ), """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") @@ -546,15 +546,15 @@ def test_multi_level_tiling_dense_dp4a(): l38 = sch.fuse(l17, l27, preserve_unit_iters=True) sch.bind(loop=l38, thread_axis="threadIdx.x") b39 = sch.cache_write(block=b6, write_buffer_index=0, storage_scope="local") -sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True) +sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True, index=-1) b40 = sch.cache_read(block=b6, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True) +sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True, index=-1) l41, l42, l43, l44, l45, l46 = sch.get_loops(block=b40) l47 = sch.fuse(l45, l46, preserve_unit_iters=True) v48 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b40, ann_key="meta_schedule.cooperative_fetch", ann_val=v48) b49 = sch.cache_read(block=b6, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True) 
+sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True, index=-1) l50, l51, l52, l53, l54, l55 = sch.get_loops(block=b49) l56 = sch.fuse(l54, l55, preserve_unit_iters=True) v57 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) @@ -632,9 +632,9 @@ def test_cuda_tensor_core_matmul_relu(): l52 = sch.fuse(l31, l41, preserve_unit_iters=True) sch.bind(loop=l52, thread_axis="threadIdx.y") b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared") -sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True) +sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1) b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True) +sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1) v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55) sch.reverse_compute_inline(block=b2) @@ -646,19 +646,19 @@ def test_cuda_tensor_core_matmul_relu(): b72 = sch.blockize(loop=l64) sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1) l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73) l80 = sch.fuse(l78, l79, preserve_unit_iters=True) v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81) b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b82, loop=l47, 
preserve_unit_loops=True, index=-1) l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82) l89 = sch.fuse(l87, l88, preserve_unit_iters=True) v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90) b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True) +sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1) l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91) l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True) l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True) @@ -667,7 +667,7 @@ def test_cuda_tensor_core_matmul_relu(): b112 = sch.blockize(loop=l102) sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True) +sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1) l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113) l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True) l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True) @@ -772,9 +772,9 @@ def test_cuda_tensor_core_software_pipeline_matmul_relu(): l52 = sch.fuse(l31, l41, preserve_unit_iters=True) sch.bind(loop=l52, thread_axis="threadIdx.y") b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared") -sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True) +sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1) b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True) 
+sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1) v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55) sch.reverse_compute_inline(block=b2) @@ -786,19 +786,19 @@ def test_cuda_tensor_core_software_pipeline_matmul_relu(): b72 = sch.blockize(loop=l64) sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1) l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73) l80 = sch.fuse(l78, l79, preserve_unit_iters=True) v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81) b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1) l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82) l89 = sch.fuse(l87, l88, preserve_unit_iters=True) v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90) b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True) +sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1) l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91) l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True) l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True) @@ -807,7 +807,7 @@ def 
test_cuda_tensor_core_software_pipeline_matmul_relu(): b112 = sch.blockize(loop=l102) sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True) +sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1) l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113) l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True) l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True) @@ -895,7 +895,7 @@ def test_cuda_tensor_core_matmul_relu_global(): l51 = sch.fuse(l30, l40, preserve_unit_iters=True) sch.bind(loop=l51, thread_axis="threadIdx.y") b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True) +sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1) sch.reverse_compute_inline(block=b1) l53, l54, l55, l56, l57 = sch.get_loops(block=b52) l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True) @@ -905,19 +905,19 @@ def test_cuda_tensor_core_matmul_relu_global(): b69 = sch.blockize(loop=l61) sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global") b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True) +sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1) l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70) l77 = sch.fuse(l75, l76, preserve_unit_iters=True) v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78) b79 = sch.cache_read(block=b19, read_buffer_index=1, 
storage_scope="shared") -sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True) +sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1) l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79) l86 = sch.fuse(l84, l85, preserve_unit_iters=True) v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87) b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1) l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88) l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True) l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True) @@ -926,7 +926,7 @@ def test_cuda_tensor_core_matmul_relu_global(): b109 = sch.blockize(loop=l99) sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1) l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110) l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True) l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True) @@ -995,7 +995,7 @@ def test_cuda_tensor_core_matmul_relu_global(): l51 = sch.fuse(l30, l40, preserve_unit_iters=True) sch.bind(loop=l51, thread_axis="threadIdx.y") b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True) +sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1) sch.reverse_compute_inline(block=b1) l53, l54, l55, l56, 
l57 = sch.get_loops(block=b52) l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True) @@ -1005,19 +1005,19 @@ def test_cuda_tensor_core_matmul_relu_global(): b69 = sch.blockize(loop=l61) sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global") b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True) +sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1) l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70) l77 = sch.fuse(l75, l76, preserve_unit_iters=True) v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78) b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True) +sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1) l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79) l86 = sch.fuse(l84, l85, preserve_unit_iters=True) v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87) b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1) l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88) l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True) l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True) @@ -1026,7 +1026,7 @@ def test_cuda_tensor_core_matmul_relu_global(): b109 = sch.blockize(loop=l99) sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") b110 = sch.cache_read(block=b19, 
read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True) +sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1) l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110) l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True) l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True) @@ -1133,9 +1133,9 @@ def test_cuda_tensor_core_conv2d(): l64 = sch.fuse(l33, l43, l53, preserve_unit_iters=True) sch.bind(loop=l64, thread_axis="threadIdx.y") b65 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="shared") -sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True) +sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True, index=-1) b66 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True) +sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True, index=-1) v67 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b65, ann_key="meta_schedule.cooperative_fetch", ann_val=v67) sch.reverse_compute_inline(block=b1) @@ -1147,19 +1147,19 @@ def test_cuda_tensor_core_conv2d(): b84 = sch.blockize(loop=l76) sch.annotate(block_or_loop=b84, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") b85 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True) +sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True, index=-1) l86, l87, l88, l89, l90, l91 = sch.get_loops(block=b85) l92 = sch.fuse(l90, l91, preserve_unit_iters=True) v93 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch", ann_val=v93) b94 = sch.cache_read(block=b21, 
read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True) +sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True, index=-1) l95, l96, l97, l98, l99, l100 = sch.get_loops(block=b94) l101 = sch.fuse(l99, l100, preserve_unit_iters=True) v102 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v102) b103 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True) +sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True, index=-1) l104, l105, l106, l107, l108, l109, l110 = sch.get_loops(block=b103) l111, l112 = sch.split(loop=l110, factors=[None, 16], preserve_unit_iters=True) l113, l114 = sch.split(loop=l109, factors=[None, 16], preserve_unit_iters=True) @@ -1168,7 +1168,7 @@ def test_cuda_tensor_core_conv2d(): b124 = sch.blockize(loop=l114) sch.annotate(block_or_loop=b124, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") b125 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True) +sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True, index=-1) l126, l127, l128, l129, l130, l131, l132 = sch.get_loops(block=b125) l133, l134 = sch.split(loop=l132, factors=[None, 16], preserve_unit_iters=True) l135, l136 = sch.split(loop=l131, factors=[None, 16], preserve_unit_iters=True) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py index b2df408e9d01..c951a5adf386 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py @@ -71,7 +71,7 @@ def 
test_random_compute_location(): [ 'b0 = sch.get_block(name="move", func_name="main")', "l1 = sch.sample_compute_location(block=b0)", - "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True)", + "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True, index=-1)", ] ] mod = Add diff --git a/tests/python/unittest/test_tir_schedule_compute_at.py b/tests/python/unittest/test_tir_schedule_compute_at.py index 0c20a4783ca0..72cba1a8fdc4 100644 --- a/tests/python/unittest/test_tir_schedule_compute_at.py +++ b/tests/python/unittest/test_tir_schedule_compute_at.py @@ -1353,5 +1353,157 @@ def _create_prim_func(): verify_trace_roundtrip(sch=sch, mod=mod) +def test_compute_at_to_index(): + @T.prim_func + def multi_producers_conv( + data: T.Buffer[(1, 3, 224, 224), "int8"], + w: T.Buffer[(16, 3, 7, 7), "int8"], + conv: T.Buffer[(1, 16, 112, 112), "int32"], + ) -> None: + pad = T.alloc_buffer([1, 3, 230, 230], dtype="int8") + wbuf = T.alloc_buffer([16, 3, 7, 7], dtype="int8") + for i0, i1, i2, i3 in T.grid(1, 3, 230, 230): + with T.block("pad"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(data[i0_1, i1_1, i2_1 - 3, i3_1 - 3]) + T.writes(pad[i0_1, i1_1, i2_1, i3_1]) + pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else( + 3 <= i2_1 and i2_1 < 227 and 3 <= i3_1 and i3_1 < 227, + data[i0_1, i1_1, i2_1 - 3, i3_1 - 3], + T.int8(0), + dtype="int8", + ) + for i0 in T.serial(1): + for ax0, ax1, ax2, ax3 in T.grid(16, 3, 7, 7): + with T.block("wbuf"): + v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(w[v0, v1, v2, v3]) + T.writes(wbuf[v0, v1, v2, v3]) + wbuf[v0, v1, v2, v3] = w[v0, v1, v2, v3] + for i1, i2, i3, i4, i5, i6 in T.grid(16, 112, 112, 3, 7, 7): + with T.block("conv"): + nn, ff, yy, xx, rc, ry, rx = T.axis.remap( + "SSSSRRR", [i0, i1, i2, i3, i4, i5, i6] + ) + T.reads(pad[nn, rc, yy * 2 + ry, xx * 2 + rx], wbuf[ff, rc, ry, rx]) + T.writes(conv[nn, ff, yy, xx]) + with T.init(): + conv[nn, ff, yy, xx] = 0 + conv[nn, ff, yy, 
xx] = conv[nn, ff, yy, xx] + T.cast( + pad[nn, rc, yy * 2 + ry, xx * 2 + rx], "int32" + ) * T.cast(wbuf[ff, rc, ry, rx], "int32") + + @T.prim_func + def multi_producers_after_compute_at( + data: T.Buffer[(1, 3, 224, 224), "int8"], + w: T.Buffer[(16, 3, 7, 7), "int8"], + conv: T.Buffer[(1, 16, 112, 112), "int32"], + ) -> None: + pad = T.alloc_buffer([1, 3, 230, 230], dtype="int8") + wbuf = T.alloc_buffer([16, 3, 7, 7], dtype="int8") + for i0 in T.serial(1): + for ax0, ax1, ax2 in T.grid(3, 229, 229): + with T.block("pad"): + i0_1 = T.axis.spatial(1, 0) + i1_1 = T.axis.spatial(3, ax0) + i2_1 = T.axis.spatial(230, ax1) + i3_1 = T.axis.spatial(230, ax2) + T.reads(data[i0_1, i1_1, i2_1 - 3, i3_1 - 3]) + T.writes(pad[i0_1, i1_1, i2_1, i3_1]) + pad[i0_1, i1_1, i2_1, i3_1] = T.if_then_else( + 3 <= i2_1 and i2_1 < 227 and 3 <= i3_1 and i3_1 < 227, + data[i0_1, i1_1, i2_1 - 3, i3_1 - 3], + T.int8(0), + dtype="int8", + ) + for ax0, ax1, ax2, ax3 in T.grid(16, 3, 7, 7): + with T.block("wbuf"): + v0, v1, v2, v3 = T.axis.remap("SSSS", [ax0, ax1, ax2, ax3]) + T.reads(w[v0, v1, v2, v3]) + T.writes(wbuf[v0, v1, v2, v3]) + wbuf[v0, v1, v2, v3] = w[v0, v1, v2, v3] + for i1, i2, i3, i4, i5, i6 in T.grid(16, 112, 112, 3, 7, 7): + with T.block("conv"): + nn, ff, yy, xx, rc, ry, rx = T.axis.remap( + "SSSSRRR", [i0, i1, i2, i3, i4, i5, i6] + ) + T.reads(pad[nn, rc, yy * 2 + ry, xx * 2 + rx], wbuf[ff, rc, ry, rx]) + T.writes(conv[nn, ff, yy, xx]) + with T.init(): + conv[nn, ff, yy, xx] = 0 + conv[nn, ff, yy, xx] = conv[nn, ff, yy, xx] + T.cast( + pad[nn, rc, yy * 2 + ry, xx * 2 + rx], "int32" + ) * T.cast(wbuf[ff, rc, ry, rx], "int32") + + sch = tir.Schedule(multi_producers_conv, debug_mask="all") + block_c = sch.get_block("pad") + axis = sch.get_loops("conv")[0] + sch.compute_at(block_c, axis, index=-2) + tvm.ir.assert_structural_equal(multi_producers_after_compute_at, sch.mod["main"]) + + +def test_reverse_compute_at_to_index(): + @T.prim_func + def main(A: T.Buffer[(128, 128), 
"float32"], D: T.Buffer[(128, 128), "float32"]) -> None: + B = T.alloc_buffer([128, 128], dtype="float32") + C = T.alloc_buffer([128, 128], dtype="float32") + for i_0, j_0, i_1 in T.grid(8, 8, 16): + for j_1 in T.serial(16): + with T.block("B"): + vi = T.axis.spatial(128, i_0 * 16 + i_1) + vj = T.axis.spatial(128, j_0 * 16 + j_1) + T.reads(A[vi, vj]) + T.writes(B[vi, vj]) + B[vi, vj] = A[vi, vj] * T.float32(2) + for ax0 in T.serial(16): + with T.block("C"): + vi = T.axis.spatial(128, i_0 * 16 + i_1) + vj = T.axis.spatial(128, j_0 * 16 + ax0) + T.reads(B[vi, vj]) + T.writes(C[vi, vj]) + C[vi, vj] = B[vi, vj] + T.float32(1) + for i, j in T.grid(128, 128): + with T.block("D"): + vi, vj = T.axis.remap("SS", [i, j]) + T.reads(B[vi, vj]) + T.writes(D[vi, vj]) + D[vi, vj] = B[vi, vj] + T.float32(1) + + @T.prim_func + def main_reverse_compute_at( + A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(128, 128), "float32"] + ) -> None: + B = T.alloc_buffer([128, 128], dtype="float32") + C = T.alloc_buffer([128, 128], dtype="float32") + for i_0, j_0, i_1 in T.grid(8, 8, 16): + for j_1 in T.serial(16): + with T.block("B"): + vi = T.axis.spatial(128, i_0 * 16 + i_1) + vj = T.axis.spatial(128, j_0 * 16 + j_1) + T.reads(A[vi, vj]) + T.writes(B[vi, vj]) + B[vi, vj] = A[vi, vj] * T.float32(2) + for ax0 in T.serial(16): + with T.block("D"): + vi = T.axis.spatial(128, i_0 * 16 + i_1) + vj = T.axis.spatial(128, j_0 * 16 + ax0) + T.reads(B[vi, vj]) + T.writes(D[vi, vj]) + D[vi, vj] = B[vi, vj] + T.float32(1) + for ax0 in T.serial(16): + with T.block("C"): + vi = T.axis.spatial(128, i_0 * 16 + i_1) + vj = T.axis.spatial(128, j_0 * 16 + ax0) + T.reads(B[vi, vj]) + T.writes(C[vi, vj]) + C[vi, vj] = B[vi, vj] + T.float32(1) + + sch = tir.Schedule(main, debug_mask="all") + block_c = sch.get_block("D") + axis = sch.get_loops("B")[2] + sch.reverse_compute_at(block_c, axis, index=1) + tvm.ir.assert_structural_equal(main_reverse_compute_at, sch.mod["main"]) + + if __name__ == "__main__": 
tvm.testing.main() From d171b4af09b89683f8648a9df4a1d5cb5902bd99 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Fri, 26 Aug 2022 10:28:20 -0600 Subject: [PATCH 055/704] [SimplifyExpr] Add simplify for dq->arg funcs (#12580) * add simplify for dq->arg funcs * add comments, fix lint * move comments to the right spots --- src/relay/transforms/simplify_expr.cc | 48 +++++++++++++++++++ tests/python/relay/test_pass_simplify_expr.py | 48 +++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index a6751933a88c..463f76995436 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -685,6 +685,7 @@ class SimplifyConsecutiveAdd : public DFPatternRewrite { DFPattern const2_; }; +/*! \brief Simplifying x/sqrt to x*sqrt */ class SimplifyRSqrt : public DFPatternRewrite { public: SimplifyRSqrt() { @@ -708,6 +709,50 @@ class SimplifyRSqrt : public DFPatternRewrite { DFPattern numerator_; }; +/*! \brief Base class for simplifying dequantize followed by arg ops */ +class SimplifyDQArgFunc : public DFPatternRewrite { + public: + explicit SimplifyDQArgFunc(std::string op) : op_(op) { + x_ = IsWildcard(); + dq_ = IsOp("qnn.dequantize")({x_, IsWildcard(), IsWildcard()}); + pattern_ = IsOp(op_)({dq_}); + } + + Expr Callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + const CallNode* call = pre.as(); + ICHECK(call); + auto x = node_map[x_][0]; + return Call(Op::Get(op_), {x}, call->attrs); + } + + protected: + /*! \brief Pattern input */ + DFPattern x_; + /*! \brief dequantize op */ + DFPattern dq_; + /*! \brief Name of op to simplify */ + String op_; +}; + +/*! \brief Simplify dequantize followed by argmax */ +class SimplifyDQArgMax : public SimplifyDQArgFunc { + public: + SimplifyDQArgMax() : SimplifyDQArgFunc("argmax") {} +}; + +/*! 
\brief Simplify dequantize followed by argmin */ +class SimplifyDQArgMin : public SimplifyDQArgFunc { + public: + SimplifyDQArgMin() : SimplifyDQArgFunc("argmin") {} +}; + +/*! \brief Simplify dequantize followed by argsort */ +class SimplifyDQArgSort : public SimplifyDQArgFunc { + public: + SimplifyDQArgSort() : SimplifyDQArgFunc("argsort") {} +}; + Expr SimplifyExpr(const Expr& expr, const IRModule& mod) { // the rewrites will be applied in the given order, and repeated until fixed point DFPatternRewriteComposer composer; @@ -725,6 +770,9 @@ Expr SimplifyExpr(const Expr& expr, const IRModule& mod) { composer.AddRewrite(); composer.AddRewrite(); composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); + composer.AddRewrite(); return RewritePatterns(composer.MakeCallbacks(), expr, mod); } diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index 837b15a48dc1..dcd58602b0ac 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -603,5 +603,53 @@ def expected(c): assert tvm.ir.structural_equal(opt, after) +def test_simplify_dq_argmax(): + shape = (4, 32, 1, 1) + x = relay.var("x", shape=shape, dtype="int8") + + def before(): + y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0)) + return relay.op.argmax(y, axis=1) + + def expected(): + return relay.op.argmax(x, axis=1) + + opt = run_opt_pass(before(), transform.SimplifyExpr()) + after = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(opt, after) + + +def test_simplify_dq_argmin(): + shape = (4, 32, 1, 1) + x = relay.var("x", shape=shape, dtype="int8") + + def before(): + y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0)) + return relay.op.argmin(y, axis=1) + + def expected(): + return relay.op.argmin(x, axis=1) + + opt = run_opt_pass(before(), transform.SimplifyExpr()) + after = run_opt_pass(expected(), transform.InferType()) + assert
tvm.ir.structural_equal(opt, after) + + +def test_simplify_dq_argsort(): + shape = (4, 32, 1, 1) + x = relay.var("x", shape=shape, dtype="int8") + + def before(): + y = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(0)) + return relay.op.argsort(y, axis=1) + + def expected(): + return relay.op.argsort(x, axis=1) + + opt = run_opt_pass(before(), transform.SimplifyExpr()) + after = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(opt, after) + + if __name__ == "__main__": pytest.main([__file__]) From d87fa854b8eb0c8f603d8dc459121eaa1a365e12 Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 27 Aug 2022 02:01:24 +0900 Subject: [PATCH 056/704] [Hexagon] Initial support for meta schedule tuning (#12587) Enables AutoTVM-style, template-based tuning for Hexagon. To run compiled code on Hexagon, we need to use Hexagon `Session` object https://github.com/apache/tvm/blob/dc522a6ff65b68532cd1bba43827cd981114df2c/python/tvm/contrib/hexagon/session.py#L35 in the metaschedule `RPCRunner`. But for RPC "session", `RPCRunner` expects an instance of `RPCSession`, https://github.com/apache/tvm/blob/53fe5966823eee4e011d7228bceab3c82c1d9caa/python/tvm/rpc/client.py#L32, to be created and used by various customizable functions. Since `RPCSession` and Hexagon `Session` have slightly different API, we cannot use `RPCRunner` with customizable functions directly. So I introduced an alternative implementation of `RPCRunner` for Hexagon. 
The test is disabled for simulator since `HexagonLauncherSimulator` is not pickle-able due to its `multiprocessing.Process` attribute: https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614 Output log from tuning `vrmpy` dense (included in the test) ``` ID | Name | FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated -------------------------------------------------------------------------------------------------------------- 0 | main | 150994944 | 1 | 380.3399 | 397.0000 | 397.0000 | 32 | -------------------------------------------------------------------------------------------------------------- ``` --- apps/hexagon_api/CMakeLists.txt | 2 + python/tvm/contrib/hexagon/meta_schedule.py | 166 ++++++++++++++ python/tvm/contrib/hexagon/session.py | 8 +- python/tvm/contrib/hexagon/tools.py | 7 + python/tvm/meta_schedule/default_config.py | 6 +- python/tvm/target/target.py | 5 + python/tvm/tir/tensor_intrin/__init__.py | 2 +- python/tvm/tir/tensor_intrin/hexagon.py | 71 ++++++ src/target/target_kind.cc | 1 + .../test_hexagon/test_meta_schedule.py | 211 ++++++++++++++++++ 10 files changed, 472 insertions(+), 7 deletions(-) create mode 100644 python/tvm/contrib/hexagon/meta_schedule.py create mode 100644 python/tvm/tir/tensor_intrin/hexagon.py create mode 100644 tests/python/contrib/test_hexagon/test_meta_schedule.py diff --git a/apps/hexagon_api/CMakeLists.txt b/apps/hexagon_api/CMakeLists.txt index aa971c875307..9a05cf3675b6 100644 --- a/apps/hexagon_api/CMakeLists.txt +++ b/apps/hexagon_api/CMakeLists.txt @@ -87,6 +87,7 @@ ExternalProject_Add(android_tvm_runtime_rpc "-DUSE_HEXAGON_RPC=ON" "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" "-DUSE_ALTERNATIVE_LINKER=OFF" + "-DUSE_RANDOM=ON" INSTALL_COMMAND "" BUILD_ALWAYS ON ) @@ -133,6 +134,7 @@ ExternalProject_Add(hexagon_tvm_runtime_rpc "-DUSE_ALTERNATIVE_LINKER=OFF" "-DUSE_CUSTOM_LOGGING=ON" "-DUSE_HEXAGON_QHL=ON" + 
"-DUSE_RANDOM=ON" "${GTEST_FLAG}" INSTALL_COMMAND "" BUILD_ALWAYS ON diff --git a/python/tvm/contrib/hexagon/meta_schedule.py b/python/tvm/contrib/hexagon/meta_schedule.py new file mode 100644 index 000000000000..8a4de74b6131 --- /dev/null +++ b/python/tvm/contrib/hexagon/meta_schedule.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Meta schedule tuning utilities for Hexagon.""" +import os +import tempfile +from typing import Callable, List, Optional +from tvm.contrib.popen_pool import PopenPoolExecutor +from tvm.meta_schedule.utils import cpu_count, derived_object +from tvm.meta_schedule.builder import LocalBuilder +from tvm.meta_schedule.runner import ( + EvaluatorConfig, + RunnerInput, + RunnerFuture, + PyRunner, +) +from tvm.meta_schedule.runner.rpc_runner import ( + default_alloc_argument, + default_run_evaluator, + RPCRunnerFuture, +) + +from .build import HexagonLauncherRPC +from .tools import export_module + + +@derived_object +class HexagonRPCRunner(PyRunner): + """RPCRunner for Hexagon. 
See the documentation of RPCRunner for more details.""" + + def __init__( + self, + hexagon_launcher: HexagonLauncherRPC, + evaluator_config: Optional[EvaluatorConfig] = None, + cooldown_sec: float = 0.0, + alloc_repeat: int = 1, + max_workers: Optional[int] = None, + initializer: Optional[Callable[[], None]] = None, + ): + """ + Parameters + ---------- + hexagon_launcher : HexagonLauncherRPC + The RPC launcher for Hexagon. It is needed for creating hexagon.Session + object inside the worker function. + evaluator_config: EvaluatorConfig + The evaluator configuration. + cooldown_sec: float + The cooldown in seconds. + alloc_repeat: int + The number of times to random fill the allocation. + max_workers: Optional[int] = None + The maximum number of connections. Defaults to number of logical CPU cores. + initializer: Optional[Callable[[], None]] + The initializer function. + """ + + super().__init__() + self.hexagon_launcher = hexagon_launcher + self.evaluator_config = EvaluatorConfig._normalized(evaluator_config) + self.cooldown_sec = cooldown_sec + self.alloc_repeat = alloc_repeat + if max_workers is None: + max_workers = cpu_count(logical=True) + self.pool = PopenPoolExecutor( + max_workers=max_workers, + timeout=100, + initializer=initializer, + ) + + def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: + results = [] + for runner_input in runner_inputs: + future = RPCRunnerFuture( + future=self.pool.submit( + _worker_func, + self.hexagon_launcher, + self.evaluator_config, + self.alloc_repeat, + str(runner_input.artifact_path), + tuple(arg_info.as_json() for arg_info in runner_input.args_info), + ), + timeout_sec=100, + ) + results.append(future) + return results + + +def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info): + with hexagon_launcher.start_session() as session: + device = session.device + _, remote_path = os.path.split(artifact_path) + uploaded = session.upload(artifact_path, remote_path) + rt_mod 
= session.load_module(uploaded) + repeated_args = default_alloc_argument( + session, + device, + args_info, + alloc_repeat, + ) + costs = default_run_evaluator( + session, + rt_mod, + device, + evaluator_config, + repeated_args, + ) + return costs + + +def get_hexagon_local_builder(): + """Return Hexagon-compatible Builder for meta schedule.""" + + def export_func(mod): + binary_path = export_module(mod, tempfile.mkdtemp()) + return str(binary_path) + + return LocalBuilder(f_export=export_func) + + +def get_hexagon_rpc_runner( + hexagon_launcher: HexagonLauncherRPC, number=3, repeat=1, min_repeat_ms=100 +): + """Return Hexagon-compatible RPC Runner for meta schedule. + + Parameters + ---------- + hexagon_launcher : HexagonLauncherRPC + The RPC launcher for Hexagon. + number: int + The number of times to run this function for taking average. + We call these runs as one `repeat` of measurement. + repeat: int + The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warm up and will be discarded. + The returned result contains `repeat` costs, + each of which is an average of `number` costs. + min_repeat_ms: int + Minimum repeat time in ms. if the execution latency is too short, + increase the number of runs to the given time (in ms) to reduce the measurement error. 
+ """ + evaluator_config = EvaluatorConfig( + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + enable_cpu_cache_flush=False, + ) + + return HexagonRPCRunner( + hexagon_launcher, + evaluator_config, + ) diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py index 0c0bf296df44..9308e396b2a5 100644 --- a/python/tvm/contrib/hexagon/session.py +++ b/python/tvm/contrib/hexagon/session.py @@ -30,6 +30,7 @@ AOTExecutorFactoryModule, GraphExecutorFactoryModule, ) +from .tools import export_module class Session: @@ -110,6 +111,9 @@ def device(self): return self._device + def get_function(self, name): + return self._rpc.get_function(name) + def upload(self, local_path: Union[str, pathlib.Path], remote_filename: str) -> pathlib.Path: """Upload a local file to the remote workspace. @@ -154,10 +158,8 @@ def load_module(self, module: Union[str, pathlib.Path, tvm.runtime.Module]): if isinstance(module, tvm.runtime.Module): with tempfile.TemporaryDirectory() as temp_dir: - temp_dir = pathlib.Path(temp_dir) binary_name = "test_binary.so" - binary_path = temp_dir / binary_name - module.save(str(binary_path)) + binary_path = export_module(module, temp_dir, binary_name) remote_file_path = self.upload(binary_path, binary_name) else: remote_file_path = module diff --git a/python/tvm/contrib/hexagon/tools.py b/python/tvm/contrib/hexagon/tools.py index 1aec8c7d565b..3f4adb90f645 100644 --- a/python/tvm/contrib/hexagon/tools.py +++ b/python/tvm/contrib/hexagon/tools.py @@ -194,3 +194,10 @@ def create_aot_shared(so_name: Union[str, pathlib.Path], files, hexagon_arch: st cross_compile.output_format = "o" c_files = [str(file) for file in files] cross_compile(str(so_name), c_files, options=compile_options + options) + + +def export_module(module, out_dir, binary_name="test_binary.so"): + """Export Hexagon shared object to a file.""" + binary_path = pathlib.Path(out_dir) / binary_name + module.save(str(binary_path)) + return binary_path diff 
--git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 0f1f7d3c2c6a..97cbfc58a6c1 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -178,7 +178,7 @@ def schedule_rules( # pylint: disable=redefined-outer-name return sch_rules() if sch_rules is not None: raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}") - if target.kind.name == "llvm": + if target.kind.name in ["llvm", "hexagon"]: return _DefaultLLVM.schedule_rules() if target.kind.name in ["cuda", "rocm", "vulkan"]: return _DefaultCUDA.schedule_rules() @@ -194,7 +194,7 @@ def postproc( # pylint: disable=redefined-outer-name return postproc() if postproc is not None: raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}") - if target.kind.name == "llvm": + if target.kind.name in ["llvm", "hexagon"]: return _DefaultLLVM.postprocs() if target.kind.name in ["cuda", "rocm", "vulkan"]: return _DefaultCUDA.postprocs() @@ -212,7 +212,7 @@ def mutator_probs( # pylint: disable=redefined-outer-name raise TypeError( f"Expected `mutator_probs` to be None or callable, but gets: {mutator_probs}" ) - if target.kind.name == "llvm": + if target.kind.name in ["llvm", "hexagon"]: return _DefaultLLVM.mutator_probs() if target.kind.name in ["cuda", "rocm", "vulkan"]: return _DefaultCUDA.mutator_probs() diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index a558fcbeaf5b..1e9e2e698c44 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -636,6 +636,8 @@ def hexagon(cpu_ver="v66", **kwargs): Whether to use QFloat HVX instructions. use_ieee_fp : bool (default: False) Whether to use IEEE HVX instructions + num_cores : int (default: 4) + The number of HVX threads. This attribute is required by meta scheduler. Note: Floating point support in HVX requires LLVM 14+. 
""" @@ -740,6 +742,9 @@ def create_llvm_options(cpu_ver, config): # pylint: disable=unused-argument args_list = target_str.split() + llvm_str.split() + num_cores = config["num_cores"] if "num_cores" in kwargs else 4 + args_list.append("--num-cores=%d" % num_cores) + return Target(" ".join(["hexagon"] + args_list)) diff --git a/python/tvm/tir/tensor_intrin/__init__.py b/python/tvm/tir/tensor_intrin/__init__.py index f0725b666e3b..7e5a26bdeb43 100644 --- a/python/tvm/tir/tensor_intrin/__init__.py +++ b/python/tvm/tir/tensor_intrin/__init__.py @@ -16,4 +16,4 @@ # under the License. # pylint: disable=unused-import """Intrinsics for tensorization.""" -from . import arm_cpu, cuda, rocm, x86 +from . import arm_cpu, cuda, rocm, x86, hexagon diff --git a/python/tvm/tir/tensor_intrin/hexagon.py b/python/tvm/tir/tensor_intrin/hexagon.py new file mode 100644 index 000000000000..0227312d6373 --- /dev/null +++ b/python/tvm/tir/tensor_intrin/hexagon.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,missing-function-docstring +"""Intrinsics for Hexagon tensorization.""" +from tvm.script import tir as T +from .. 
import TensorIntrin + + +@T.prim_func +def dot_product_32x4_u8u8i32_desc( + A: T.Buffer((4,), "uint8", offset_factor=1), + B: T.Buffer((32, 4), "uint8", offset_factor=1), + C: T.Buffer((32,), "int32", offset_factor=1), +) -> None: + with T.block("root"): + T.reads(C[0:32], A[0:4], B[0:32, 0:4]) + T.writes(C[0:32]) + for i in T.serial(0, 32): + with T.init(): + C[i] = T.int32(0) + for k in T.serial(0, 4): + with T.block("update"): + vi, vk = T.axis.remap("SR", [i, k]) + C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32") + + +@T.prim_func +def dot_product_32x4_u8u8i32_vrmpy( + A: T.Buffer((4,), "uint8", offset_factor=1), + B: T.Buffer((32, 4), "uint8", offset_factor=1), + C: T.Buffer((32,), "int32", offset_factor=1), +) -> None: + with T.block("root"): + T.reads(C[0:32], A[0:4], B[0:32, 0:4]) + T.writes(C[0:32]) + + A_u8x4 = A.vload([0], "uint8x4") + A_i32 = T.reinterpret(A_u8x4, dtype="int32") + + B_i8x128 = B.vload([0, 0], dtype="uint8x128") + B_i32x32 = T.reinterpret(B_i8x128, dtype="int32x32") + + C[T.ramp(T.int32(0), 1, 32)] = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyub.acc.128B"), + T.uint32(3), + C[T.ramp(T.int32(0), 1, 32)], + B_i32x32, + A_i32, + dtype="int32x32", + ) + + +VRMPY_u8u8i32_INTRIN = "dot_32x4_u8u8i32_vrmpy" + +TensorIntrin.register( + VRMPY_u8u8i32_INTRIN, dot_product_32x4_u8u8i32_desc, dot_product_32x4_u8u8i32_vrmpy +) diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index e3b2d7b096fd..a95f55357f2d 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -417,6 +417,7 @@ TVM_REGISTER_TARGET_KIND("hexagon", kDLHexagon) .add_attr_option("mcpu") .add_attr_option("mtriple") .add_attr_option>("llvm-options") + .add_attr_option("num-cores") .set_default_keys({"hexagon"}); TVM_REGISTER_TARGET_KIND("stackvm", kDLCPU); diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py new file mode 100644 
index 000000000000..96d18c9b3076 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -0,0 +1,211 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test rpc based launcher for hexagon """ +import pytest +import numpy as np +import tempfile + +import tvm.testing +from tvm import te +from tvm import meta_schedule as ms +from tvm.meta_schedule.arg_info import TensorInfo +from tvm.meta_schedule.builder import BuilderInput +from tvm.script import tir as T +from tvm.tir import FloatImm +from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN +from tvm.meta_schedule.runner import RunnerInput +from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner + +MATMUL_N = 16 +MATMUL_M = 32 + + +@tvm.script.ir_module +class MatmulModule: + @T.prim_func + def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-self-argument + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, (16, 16), "float32") + B = T.match_buffer(b, (16, 16), "float32") + C = T.match_buffer(c, (16, 16), "float32") + for i, j, k in T.grid(16, 16, 16): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vi, 
vj] = 0.0 + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + + +@tvm.testing.requires_hexagon +def test_builder_runner(hexagon_launcher): + if hexagon_launcher._serial_number == "simulator": + pytest.skip(msg="Tuning on simulator not supported.") + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + target = tvm.target.Target(target_hexagon, host=target_hexagon) + mod = MatmulModule + + builder = get_hexagon_local_builder() + runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0) + + (builder_result,) = builder.build([BuilderInput(mod, target)]) + assert builder_result.artifact_path is not None + assert builder_result.error_msg is None + + runner_input = RunnerInput( + builder_result.artifact_path, + "llvm", + [ + TensorInfo("float32", (MATMUL_N, MATMUL_N)), + TensorInfo("float32", (MATMUL_N, MATMUL_N)), + TensorInfo("float32", (MATMUL_N, MATMUL_N)), + ], + ) + + # Run the module + (runner_future,) = runner.run([runner_input]) + runner_result = runner_future.result() + + assert runner_result.error_msg is None + for result in runner_result.run_secs: + if isinstance(result, FloatImm): + result = result.value + assert isinstance(result, float) + assert result >= 0.0 + + +def dense(m, n, k): + X = te.placeholder((m, k), name="X", dtype="uint8") + packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8") + + ak = te.reduce_axis((0, k), name="k") + out = te.compute( + (m, n), + lambda i, j: te.sum( + X[i, ak].astype("int32") + * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype( + "int32" + ), + axis=ak, + ), + name="compute", + ) + return [X, packedW, out] + + +def schedule_dense(sch, block, M, do_tune): + a_y, a_x, _ = sch.get_loops(block)[-3:] + + if do_tune: + y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128) + a_yo, a_yi = sch.split(a_y, factors=y_factors) + else: + a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)]) + + a_xo, a_xi = 
sch.split(a_x, factors=[None, 32]) + sch.reorder(a_yo, a_xo, a_yi, a_xi) + + a_xi, a_k = sch.get_loops(block)[-2:] + a_ko, a_ki = sch.split(a_k, factors=[None, 4]) + sch.reorder(a_ko, a_xi, a_ki) + + fused = sch.fuse(a_yo, a_xo) + + sch.parallel(fused) + + dec = sch.decompose_reduction(block, a_ko) + + init_loop = sch.get_loops(dec)[-1] + sch.vectorize(init_loop) + + sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN) + + +def verify_dense(sch, target, M, N, K, hexagon_session): + f = tvm.build(sch.mod["main"], target=target, name="dense") + mod = hexagon_session.load_module(f) + dev = hexagon_session.device + + a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8") + b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8") + c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32")) + + packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8") + + for r_idx in range(N // 32): + for ko in range(K // 4): + for s_idx in range(32): + for t_idx in range(4): + packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx] + + a = tvm.nd.array(a_np, dev) + b = tvm.nd.array(packW, dev) + c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev) + + mod(a, b, c) + np.testing.assert_equal(c.numpy(), c_np) + + evaluator = mod.time_evaluator(mod.entry_name, dev, number=10) + gflops = (N * M * K) * 2 / 1e9 + time_ms = evaluator(a, b, c).mean * 1e3 + print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3))) + + +@pytest.mark.skip(reason="xgboost not installed on CI") +@tvm.testing.requires_hexagon +def test_vrmpy_dense(hexagon_launcher): + if hexagon_launcher._serial_number == "simulator": + pytest.skip(msg="Tuning on simulator not supported.") + + do_tune = True + target_hexagon = tvm.target.hexagon("v68") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + M, N, K = 128, 768, 768 + workload = te.create_prim_func(dense(M, N, K)) + + if not do_tune: + ir_module = tvm.IRModule({"main": workload}) + sch = 
tvm.tir.Schedule(ir_module) + block = sch.get_block("compute") + schedule_dense(sch, block, M, do_tune) + else: + with tempfile.TemporaryDirectory() as work_dir: + config = ms.TuneConfig( + strategy="replay_trace", + num_trials_per_iter=8, + max_trials_per_task=8, + max_trials_global=8, + ) + + def schedule_dense_for_tune(sch): + block = sch.get_block("compute") + return schedule_dense(sch, block, None, True) + + sch = ms.tune_tir( + mod=workload, + target=target, + config=config, + work_dir=work_dir, + space=ms.space_generator.ScheduleFn(schedule_dense_for_tune), + builder=get_hexagon_local_builder(), + runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), + ) + + with hexagon_launcher.start_session() as session: + verify_dense(sch, target, M, N, K, session) From 49b3c72935b290afa9eee1f1c57a4b4c2f10a445 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 26 Aug 2022 10:15:54 -0700 Subject: [PATCH 057/704] [TIR] More hygienic TVM_SREF macros (#12607) Previously, the `TVM_SREF_TO_BLOCK`, `TVM_SREF_TO_FOR`, and `TVM_TYPE_AS` macros required both the input and output variables. The input variable name is useful for improving the error message returned, but the output variable name isn't necessary for this functionality, and prevents the macro from being used as part of an expression. * Generate an immediately-invoked lambda expression to allow for an independently-scoped `result` variable. * Use parentheses around the input argument, in case the sref is the result of an expression. * Update all call sites to remove the macro argument providing the first argument.
--- src/meta_schedule/mutator/mutate_parallel.cc | 4 +- .../mutator/mutate_thread_binding.cc | 8 +-- src/meta_schedule/mutator/mutate_tile_size.cc | 4 +- src/meta_schedule/mutator/mutate_unroll.cc | 4 +- .../rewrite_parallel_vectorize_unroll.cc | 4 +- src/meta_schedule/schedule_rule/auto_bind.cc | 2 +- .../schedule_rule/auto_inline.cc | 2 +- .../schedule_rule/multi_level_tiling.cc | 2 +- .../multi_level_tiling_tensor_core.cc | 4 +- .../schedule_rule/random_compute_location.cc | 2 +- src/meta_schedule/utils.h | 2 +- src/tir/schedule/analysis/analysis.cc | 48 ++++++++--------- src/tir/schedule/block_scope.cc | 2 +- src/tir/schedule/concrete_schedule.cc | 4 +- src/tir/schedule/concrete_schedule.h | 6 +-- src/tir/schedule/primitive/block_annotate.cc | 6 +-- .../schedule/primitive/blockize_tensorize.cc | 2 +- .../schedule/primitive/cache_read_write.cc | 14 ++--- src/tir/schedule/primitive/compute_at.cc | 12 ++--- src/tir/schedule/primitive/compute_inline.cc | 8 +-- .../schedule/primitive/decompose_padding.cc | 2 +- src/tir/schedule/primitive/for_kind.cc | 4 +- src/tir/schedule/primitive/get_block_loop.cc | 2 +- .../primitive/layout_transformation.cc | 10 ++-- .../schedule/primitive/loop_transformation.cc | 10 ++-- src/tir/schedule/primitive/reduction.cc | 12 ++--- src/tir/schedule/primitive/sampling.cc | 2 +- src/tir/schedule/state.cc | 14 ++--- src/tir/schedule/transform.cc | 6 +-- src/tir/schedule/utils.h | 51 ++++++++++++------- 30 files changed, 133 insertions(+), 120 deletions(-) diff --git a/src/meta_schedule/mutator/mutate_parallel.cc b/src/meta_schedule/mutator/mutate_parallel.cc index 5b7fe7f5148d..82b91da682c6 100644 --- a/src/meta_schedule/mutator/mutate_parallel.cc +++ b/src/meta_schedule/mutator/mutate_parallel.cc @@ -64,7 +64,7 @@ const BlockRVNode* GetInstGetBlockOutput(const Instruction& inst) { return nullptr; } ICHECK_EQ(inst->outputs.size(), 1); - const BlockRVNode* block = TVM_TYPE_AS(block, inst->outputs[0], BlockRVNode); + const BlockRVNode* block 
= TVM_TYPE_AS(inst->outputs[0], BlockRVNode); return block; } @@ -82,7 +82,7 @@ std::vector> AnalyzeParallel(const ScheduleState& self, Array block_srefs = tir::GetBlocks(self, block_name, self->mod->GetGlobalVar(func_name)); ICHECK_EQ(block_srefs.size(), 1); - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_srefs[0]); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_srefs[0]); ScopeBlockLoopInfo info = GetScopeBlockLoopInfo(GetRef(block)); std::vector> results; results.reserve(info.realizes.size()); diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc index 41207162ee1d..de780b53e2d9 100644 --- a/src/meta_schedule/mutator/mutate_thread_binding.cc +++ b/src/meta_schedule/mutator/mutate_thread_binding.cc @@ -109,12 +109,12 @@ std::vector MutateThreadBindingNode::FindCan for (const Instruction& inst : trace->insts) { if (inst->kind.same_as(inst_sample_categorical)) { ICHECK_EQ(inst->outputs.size(), 1); - const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode); + const PrimExprNode* var_rv = TVM_TYPE_AS(inst->outputs[0], PrimExprNode); sample_insts[var_rv] = inst.get(); } else if (is_split_by_sample(inst)) { CHECK_EQ(inst->outputs.size(), 2); // Only consider the inner loop, which can be bound to threadIdx.x - const tir::LoopRVNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[1], tir::LoopRVNode); + const tir::LoopRVNode* var_rv = TVM_TYPE_AS(inst->outputs[1], tir::LoopRVNode); sampled_split_insts[var_rv] = inst.get(); } else if (is_thread_binding_by_sample(inst)) { bind_insts.push_back(inst.get()); @@ -122,12 +122,12 @@ std::vector MutateThreadBindingNode::FindCan } for (const InstructionNode* bind_inst : bind_insts) { - const auto* loop_rv = TVM_TYPE_AS(loop_rv, bind_inst->inputs[0], tir::LoopRVNode); + const auto* loop_rv = TVM_TYPE_AS(bind_inst->inputs[0], tir::LoopRVNode); auto split_it = sampled_split_insts.find(loop_rv); ICHECK(split_it != 
sampled_split_insts.end()); const InstructionNode* split_inst = split_it->second; - const auto* expr_rv = TVM_TYPE_AS(expr_rv, split_inst->inputs[2], PrimExprNode); + const auto* expr_rv = TVM_TYPE_AS(split_inst->inputs[2], PrimExprNode); auto sample_it = sample_insts.find(expr_rv); ICHECK(sample_it != sample_insts.end()); const InstructionNode* sample_inst = sample_it->second; diff --git a/src/meta_schedule/mutator/mutate_tile_size.cc b/src/meta_schedule/mutator/mutate_tile_size.cc index 00967aef7acd..4a3bfda8a4a8 100644 --- a/src/meta_schedule/mutator/mutate_tile_size.cc +++ b/src/meta_schedule/mutator/mutate_tile_size.cc @@ -34,7 +34,7 @@ using tir::Trace; * \return The result of downcast */ std::vector DowncastTilingDecision(const ObjectRef& decision) { - const auto* arr = TVM_TYPE_AS(arr, decision, runtime::ArrayNode); + const auto* arr = TVM_TYPE_AS(decision, runtime::ArrayNode); return support::AsVector(GetRef>(arr)); } @@ -123,7 +123,7 @@ void FindSampleVectorize(const Trace& trace, std::vector* inst, if (inst->kind.same_as(inst_sample_categorical)) { ICHECK_EQ(inst->outputs.size(), 1); if (annotated.count(inst->outputs[0].get())) { - const auto* d = TVM_TYPE_AS(d, decision, IntImmNode); + const auto* d = TVM_TYPE_AS(decision, IntImmNode); instructions.push_back(inst); decisions.push_back(d->value); } diff --git a/src/meta_schedule/mutator/mutate_unroll.cc b/src/meta_schedule/mutator/mutate_unroll.cc index 94e83488584e..c282a171c3b7 100644 --- a/src/meta_schedule/mutator/mutate_unroll.cc +++ b/src/meta_schedule/mutator/mutate_unroll.cc @@ -91,7 +91,7 @@ bool FindUnrollDecision(const Trace& trace, TRandState* rand_state, for (const Instruction& inst : trace->insts) { if (inst->kind.same_as(inst_sample_categorical)) { ICHECK_EQ(inst->outputs.size(), 1); - const PrimExprNode* var_rv = TVM_TYPE_AS(var_rv, inst->outputs[0], PrimExprNode); + const PrimExprNode* var_rv = TVM_TYPE_AS(inst->outputs[0], PrimExprNode); sample_insts[var_rv] = inst.get(); } else if 
(IsAnnotateWithUnroll(inst)) { ann_insts.push_back(inst.get()); @@ -103,7 +103,7 @@ bool FindUnrollDecision(const Trace& trace, TRandState* rand_state, } const InstructionNode* ann_inst = ann_insts[tir::SampleInt(rand_state, 0, n_ann_insts)]; ICHECK_EQ(ann_inst->inputs.size(), 2); - const auto* var_rv = TVM_TYPE_AS(var_rv, ann_inst->inputs[1], PrimExprNode); + const auto* var_rv = TVM_TYPE_AS(ann_inst->inputs[1], PrimExprNode); ICHECK(sample_insts.count(var_rv)); const InstructionNode* sample_inst = sample_insts.at(var_rv); ICHECK_EQ(sample_inst->attrs.size(), 2); diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc index f3c2b1328bc3..08d25d017840 100644 --- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc +++ b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc @@ -233,7 +233,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv, int64_t prod_extent = 1; for (int i = 0; i < n_loops && loop_types[i] == IterVarType::kDataPar; ++i) { const StmtSRef& loop_sref = loop_srefs[i]; - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); if (HasAnnOrBinding(loop)) { break; } @@ -262,7 +262,7 @@ void AdjustParallelVectorize(const Schedule& sch, const BlockRV& block_rv, for (int i = n_loops - 1; i >= 0 && loop_types[i] == IterVarType::kDataPar && num_fusible < max_fusible; --i) { const StmtSRef& loop_sref = loop_srefs[i]; - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); if (HasAnnOrBinding(loop)) { break; } diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc index ff4d26084e57..d8f52fa8e1de 100644 --- a/src/meta_schedule/schedule_rule/auto_bind.cc +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -45,7 +45,7 @@ void BindBlockThreadIdx(const tir::Schedule& sch, 
const tir::BlockRV& block_rv, int i_spatial_loop = -1; for (int i = 0; i < n; ++i) { const StmtSRef& loop_sref = loops[i]; - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); runtime::ThreadScope thread_scope = GetThreadScope(loop); if (IsBlockIdx(thread_scope)) { if (i_block_idx == -1) { diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc index df4d3ac85911..76313f46d1c8 100644 --- a/src/meta_schedule/schedule_rule/auto_inline.cc +++ b/src/meta_schedule/schedule_rule/auto_inline.cc @@ -96,7 +96,7 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch, StmtSRef block_sref = sch->GetSRef(block_rv); bool is_pure_sptial = IsInSpatialPrimFunc(sch, block_sref); ScheduleState state = sch->state(); - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); BlockRealize realize = GetBlockRealize(state, block_sref); // Cond 1. 
The block has only one write buffer if (block->writes.size() != 1) { diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc index eefc2eea411b..c126c854462c 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc @@ -37,7 +37,7 @@ namespace tir { * of multi-level tiling, so it's intentionally kept inside this file not in the analysis header */ std::vector GetReadBufferNDims(const StmtSRef& block_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); const BufferNode* write_buffer = block->writes[0]->buffer.get(); int n = block->reads.size(); std::vector results(n, -1); diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 49704fb66b15..7ddda9b2635b 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -411,7 +411,7 @@ Optional MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin( tir::StmtSRef block_sref = state->sch->GetSRef(state->block_rv); // Add reindex stages - const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); // Hold the reference of the block before reindex const tir::Block block_before_reindex = GetRef(block); if (block->reads.size() != 2 || block->writes.size() != 1) { @@ -488,7 +488,7 @@ Optional MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin( } visited_buffers.insert(lhs_buffer); // Refresh block pointer (block sref is not invalidated) - block = TVM_SREF_TO_BLOCK(block, block_sref); + block = TVM_SREF_TO_BLOCK(block_sref); const tir::BufferRegion& reindexed_buffer_region = tir::GetNthAccessBufferRegion( state->sch->state(), GetRef(block), buffer_index, 
index_type); auto sub_index_map = f_get_sub_index_map(lhs_buffer, reindexed_buffer_region->region); diff --git a/src/meta_schedule/schedule_rule/random_compute_location.cc b/src/meta_schedule/schedule_rule/random_compute_location.cc index e4b5d5bde256..65988dfd5688 100644 --- a/src/meta_schedule/schedule_rule/random_compute_location.cc +++ b/src/meta_schedule/schedule_rule/random_compute_location.cc @@ -60,7 +60,7 @@ class RandomComputeLocationNode : public ScheduleRuleNode { private: bool CheckConditions(const tir::Schedule sch, const tir::BlockRV& block_rv) const { tir::StmtSRef block_sref = sch->GetSRef(block_rv); - const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + TVM_SREF_TO_BLOCK(block_sref); // Cond 1. The block is not the root block. if (block_sref->parent == nullptr) { diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index cb84596eed11..664a6a609e7f 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -238,7 +238,7 @@ inline std::string Concat(const Array& strs, const std::string& delim) { */ inline tir::BlockRV GetRVFromSRef(const tir::Schedule& sch, const tir::StmtSRef& block_sref, const String& global_var_name) { - const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const tir::BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); return sch->GetBlock(block->name_hint, global_var_name); } diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index 62ec0b468f9d..b9e99257f37c 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -150,7 +150,7 @@ Definition of a scope that is a stage pipeline: if (require_stage_pipeline) { bool stage_pipeline = self->GetBlockInfo(scope_root_sref).scope->stage_pipeline; if (stage_pipeline == false) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root_sref); throw NotStagePipelineError(self->mod, 
GetRef(block)); } } @@ -229,7 +229,7 @@ bool IsDominantBlock(const ScheduleState& self, const StmtSRef& scope_root_sref, } } // Check whether the input block is the only writer of its outputs - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); for (const BufferRegion& write_region : block->writes) { if (buffer_writers.count(write_region->buffer)) { if (buffer_writers.at(write_region->buffer).size() != 1) { @@ -252,7 +252,7 @@ bool IsDominantBlock(const ScheduleState& self, const StmtSRef& scope_root_sref, int CheckCompleteBlockErrorCode(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_root_sref) { // Cond 1. All block vars are data parallel - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); for (const IterVar& iter_var : block->iter_vars) { if (iter_var->iter_type != kDataPar) { return 1; @@ -328,7 +328,7 @@ void CheckCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref, int error_code = CheckCompleteBlockErrorCode(self, block_sref, scope_root_sref); if (error_code != 0) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); throw IncompleteBlockError(self->mod, GetRef(block), error_code); } } @@ -344,7 +344,7 @@ void CheckCompleteBlock(const ScheduleState& self, const StmtSRef& block_sref, */ int CheckReductionBlockErrorCode(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_root_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); // Cond 1. The block has the `init` statement. 
if (!block->init.defined()) { return 1; @@ -394,7 +394,7 @@ void CheckReductionBlock(const ScheduleState& self, const StmtSRef& block_sref, int error_code = CheckReductionBlockErrorCode(self, block_sref, scope_root_sref); if (error_code != 0) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); throw NotReductionBlockError(self->mod, GetRef(block), error_code); } } @@ -441,7 +441,7 @@ void CheckCompleteOrReductionBlock(const ScheduleState& self, const StmtSRef& bl if (reduction_block_error_code == 0) { return; } - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); throw NotCompleteOrReductionBlockError(self->mod, GetRef(block), complete_block_error_code, reduction_block_error_code); } @@ -491,7 +491,7 @@ void CheckSubtreeCompactDataflow(const ScheduleState& self, const StmtSRef& subt int local_complete_block_code = CheckCompleteBlockErrorCode(self, block_sref, subtree_root), local_reduction_block_code = CheckReductionBlockErrorCode(self, block_sref, subtree_root); if (local_complete_block_code != 0 && local_reduction_block_code != 0) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); throw NotCompactDataFlowError(self->mod, GetRef(subtree_root->stmt), GetRef(block), local_complete_block_code, local_reduction_block_code); @@ -501,8 +501,8 @@ void CheckSubtreeCompactDataflow(const ScheduleState& self, const StmtSRef& subt bool IsOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_root_sref) { - const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref); - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); std::unordered_set scope_allocated; 
scope_allocated.reserve(scope_root->alloc_buffers.size()); for (const Buffer& buffer : scope_root->alloc_buffers) { @@ -532,7 +532,7 @@ void CheckNotOutputBlock(const ScheduleState& self, const StmtSRef& block_sref, Block block_; }; if (IsOutputBlock(self, block_sref, scope_root_sref)) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); throw OutputBlockError(self->mod, GetRef(block)); } } @@ -547,12 +547,12 @@ std::vector GetBlockVarTypes(const BlockNode* block) { } std::vector GetBlockVarTypes(const StmtSRef& block_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); return GetBlockVarTypes(block); } bool IsWriteCache(const StmtSRef& block_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); if (block->writes.size() != 1) { return false; } @@ -751,7 +751,7 @@ void CheckLoopStartsWithZero(const ScheduleState& self, const StmtSRef& loop_sre IRModule mod_; For loop_; }; - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); if (!analyzer->CanProve(loop->min == 0)) { throw LoopNotStartWithZeroError(self->mod, GetRef(loop)); } @@ -856,7 +856,7 @@ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sr const BlockRealizeNode* result; }; - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); if (block_sref->parent == nullptr) { const PrimFuncNode* func = GetRootPrimFunc(self->mod, block, nullptr); return Downcast(func->body); @@ -870,7 +870,7 @@ BlockRealize GetBlockRealize(const ScheduleState& self, const StmtSRef& block_sr } IterVarType GetLoopIterType(const StmtSRef& loop_sref) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); const 
Var& loop_var = loop->loop_var; int n_spatial = 0; int n_reduce = 0; @@ -1924,7 +1924,7 @@ void CheckStorageScope(const ScheduleState& self, String storage_scope) { } bool IsSpatial(const StmtSRef& block_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); for (const IterVar& iter_var : block->iter_vars) { if (iter_var->iter_type != IterVarType::kDataPar) { return false; @@ -1934,14 +1934,14 @@ bool IsSpatial(const StmtSRef& block_sref) { } bool IsTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + TVM_SREF_TO_BLOCK(block_sref); Array loops = GetLoops(block_sref); Array binds = GetBlockRealize(self, block_sref)->iter_values; if (loops.size() != binds.size()) { return false; } for (int i = 0, n = loops.size(); i < n; ++i) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loops[i]); + const ForNode* loop = TVM_SREF_TO_FOR(loops[i]); if (binds[i].get() != loop->loop_var.get()) { return false; } @@ -1953,7 +1953,7 @@ bool NeedsMultiLevelTiling(const ScheduleState& self, const StmtSRef& block_sref if (HasBeenMultiLevelTiled(block_sref)) { return false; } - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); if (block->writes.size() != 1 || block->reads.empty() || IsSpatial(block_sref) || !IsTrivialBinding(self, block_sref)) { return false; @@ -2065,7 +2065,7 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self, // const tir::StmtSRef& block_sref, // int64_t max_parallel_extent, // int64_t max_parallel_basic) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Array loops = tir::GetLoops(block_sref); // Cond 1. 
The block has only one write buffer @@ -2100,9 +2100,9 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self, // } // Cond 5. - const ForNode* loop_i = TVM_SREF_TO_FOR(loop_i, loops[i]); + const ForNode* loop_i = TVM_SREF_TO_FOR(loops[i]); if (i < loops.size() - 1) { - const ForNode* loop_i1 = TVM_SREF_TO_FOR(loop_i1, loops[i + 1]); + const ForNode* loop_i1 = TVM_SREF_TO_FOR(loops[i + 1]); if (loop_i->body.get() != loop_i1) { return false; } @@ -2194,7 +2194,7 @@ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, TensorIntrinDescInfo desc_info = ExtractTensorIntrinDescInfo(&analyzer, desc_func); // Step 2. Collect loops from block_sref const tir::StmtSRef& scope_sref = GetScopeRoot(self, block_sref, false); - const tir::BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref); + TVM_SREF_TO_BLOCK(scope_sref); std::vector block_loops; std::unordered_set block_loop_vars; { diff --git a/src/tir/schedule/block_scope.cc b/src/tir/schedule/block_scope.cc index f1ce65e48e03..31452f4a8f15 100644 --- a/src/tir/schedule/block_scope.cc +++ b/src/tir/schedule/block_scope.cc @@ -76,7 +76,7 @@ BlockScope::BlockScope(const Array& child_block_srefs) { SMap> buffer_readers; SMap>& buffer_writers = n->buffer_writers; for (const StmtSRef& child_block_sref : child_block_srefs) { - const BlockNode* child_block = TVM_SREF_TO_BLOCK(child_block, child_block_sref); + const BlockNode* child_block = TVM_SREF_TO_BLOCK(child_block_sref); // Step 1. 
Update `buffer_readers` and `buffer_writers` for each buffer for (const BufferRegion& region : child_block->reads) { buffer_readers[region->buffer].push_back(child_block_sref); diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 5f773a02d6ff..afc675799706 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -269,7 +269,7 @@ BlockRV ConcreteScheduleNode::GetBlock(const String& name, const Optional(block)); } } @@ -432,7 +432,7 @@ Array ConcreteScheduleNode::Split(const LoopRV& loop_rv, // Prepare for the splitting StmtSRef loop_sref = this->GetSRef(loop_rv); - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); Array factors; factors.reserve(factor_rvs.size()); int infer_index = -1; diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 92b9de408873..e79d1d528809 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -206,13 +206,13 @@ class ConcreteScheduleNode : public ScheduleNode { inline Block ConcreteScheduleNode::Get(const BlockRV& block_rv) const { StmtSRef sref = this->GetSRef(block_rv); - const BlockNode* block = TVM_SREF_TO_BLOCK(block, sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(sref); return GetRef(block); } inline For ConcreteScheduleNode::Get(const LoopRV& loop_rv) const { StmtSRef sref = this->GetSRef(loop_rv); - const ForNode* loop = TVM_SREF_TO_FOR(loop, sref); + const ForNode* loop = TVM_SREF_TO_FOR(sref); return GetRef(loop); } @@ -223,7 +223,7 @@ inline PrimExpr ConcreteScheduleNode::Get(const ExprRV& expr_rv) const { LOG(FATAL) << "IndexError: Cannot find corresponding ExprRV: " << var; } const ObjectRef& obj = (*it).second; - const auto* int_imm = TVM_TYPE_AS(int_imm, obj, IntImmNode); + const auto* int_imm = TVM_TYPE_AS(obj, IntImmNode); return Integer(int_imm->value); }); return 
this->analyzer_->Simplify(transformed); diff --git a/src/tir/schedule/primitive/block_annotate.cc b/src/tir/schedule/primitive/block_annotate.cc index 2d876d9bf7fa..31c938313fed 100644 --- a/src/tir/schedule/primitive/block_annotate.cc +++ b/src/tir/schedule/primitive/block_annotate.cc @@ -238,7 +238,7 @@ class StorageScopeMutator : private ReplaceBufferMutator { void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_index, int axis, int factor, int offset) { - const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, BufferIndexType::kWrite); StorageAlignInvalidFactorError::Check(self->mod, factor); @@ -274,7 +274,7 @@ void StorageAlign(ScheduleState self, const StmtSRef& block_sref, int buffer_ind void SetScope(ScheduleState self, const StmtSRef& block_sref, int buffer_index, const String& storage_scope) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Buffer buffer = GetNthAccessBuffer(self, GetRef(block), buffer_index, BufferIndexType::kWrite); @@ -289,7 +289,7 @@ void SetScope(ScheduleState self, const StmtSRef& block_sref, int buffer_index, // Step 3. Get the allocation site of the target buffer. StmtSRef alloc_site_sref = NonAllocatedBufferError::CheckAndGetBufferAllocationSite(self->mod, block_sref, buffer); - const BlockNode* alloc_site = TVM_SREF_TO_BLOCK(alloc_site, alloc_site_sref); + const BlockNode* alloc_site = TVM_SREF_TO_BLOCK(alloc_site_sref); // Step 4. Recursively replace the old buffer to a new buffer, where the new buffer has the given // storage scope. In the meanwhile, collect the block sref reuse information. 
diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index cf6532e82d46..7481a7c92494 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -426,7 +426,7 @@ Stmt MakeLoopNest(Stmt stmt, const std::vector& loops) { BlockRealize BlockizeImpl(const ScheduleState& self, const StmtSRef& loop_sref, Map* block_sref_reuse, arith::Analyzer* analyzer) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + TVM_SREF_TO_FOR(loop_sref); // Step 1: Check and get the only block under `loop`. BlockRealize block_realize = CheckGetSingleChildBlockRealizeOnSRefTree(self, loop_sref); Block block = block_realize->block; diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index 529d3333cd18..a221733eb394 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -31,7 +31,7 @@ class NotSingleWriteBlock : public ScheduleError { ICHECK_GT(write_blocks.size(), 1); write_blocks_.reserve(write_blocks.size()); for (const StmtSRef& block_sref : write_blocks) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); write_blocks_.push_back(GetRef(block)); } } @@ -532,7 +532,7 @@ class CacheReadRewriter : public StmtExprMutator { bool is_consumer = info_->consumer_blocks.empty(); // Otherwise check if this is one of the specified blocks. 
for (StmtSRef consumer_sref : info_->consumer_blocks) { - const BlockNode* consumer_node = TVM_SREF_TO_BLOCK(consumer_node, consumer_sref); + const BlockNode* consumer_node = TVM_SREF_TO_BLOCK(consumer_sref); Block consumer_block = GetRef(consumer_node); if (old_stmt.same_as(consumer_block)) { is_consumer = true; @@ -999,11 +999,11 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff CheckStorageScope(self, storage_scope); // Step 1. Check index, getting the target buffer and the parent scope - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Buffer read_buffer = GetNthAccessBuffer(self, GetRef(block), read_buffer_index, BufferIndexType::kRead); StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); - const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref); + const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); // Step 2. Create CacheStageInfo CacheStageInfo info; @@ -1020,7 +1020,7 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff if (Optional _write_block_sref = GetOnlyWriteBlock(self, scope_sref, read_buffer)) { // Case 1. The buffer is written inside the block. StmtSRef write_block_sref = _write_block_sref.value(); - const BlockNode* write_block = TVM_SREF_TO_BLOCK(write_block, write_block_sref); + const BlockNode* write_block = TVM_SREF_TO_BLOCK(write_block_sref); // Find the producing region BufferRegion region = GetBufferRegionFromBuffer(write_block->writes, read_buffer).value(); StmtSRef parent_sref = GetRef(write_block_sref->parent); @@ -1072,7 +1072,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu CheckStorageScope(self, storage_scope); // Step 1. 
Checking index, getting the target buffer and the parent scope - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Buffer write_buffer = GetNthAccessBuffer(self, GetRef(block), write_buffer_index, BufferIndexType::kWrite); StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); @@ -1114,7 +1114,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu StmtSRef ReIndex(ScheduleState self, const StmtSRef& block_sref, int buffer_index, BufferIndexType buffer_index_type) { - const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Block block = GetRef(block_ptr); Buffer buffer = GetNthAccessBuffer(self, block, buffer_index, buffer_index_type); StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); diff --git a/src/tir/schedule/primitive/compute_at.cc b/src/tir/schedule/primitive/compute_at.cc index 8baedfd70dd0..83342e351b91 100644 --- a/src/tir/schedule/primitive/compute_at.cc +++ b/src/tir/schedule/primitive/compute_at.cc @@ -37,7 +37,7 @@ class NotAllRequiredBlocksAreVisitedError : public ScheduleError { : mod_(mod), num_not_visited_(num_not_visited) { required_.reserve(required.size()); for (const StmtSRef& block_sref : required) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); required_.push_back(GetRef(block)); } } @@ -306,14 +306,14 @@ class ScopeReconstructor : private StmtMutator { return GetRef(block); } if (block == rm_src_stmt_.get()) { - block = TVM_TYPE_AS(block, rm_tgt_stmt_, BlockNode); + block = TVM_TYPE_AS(rm_tgt_stmt_, BlockNode); } return StmtMutator::VisitStmt_(block); } Stmt VisitStmt_(const ForNode* loop) final { if (loop == rm_src_stmt_.get()) { - loop = TVM_TYPE_AS(loop, rm_tgt_stmt_, ForNode); + loop = TVM_TYPE_AS(rm_tgt_stmt_, 
ForNode); } if (loop == loop_.get()) { return new_loop_; @@ -559,7 +559,7 @@ void CalculateProvidedRequiredRegions( } // Step 2. Calculate the region required by dependent blocks under `loop` for (const StmtSRef& required_block_sref : is_compute_at ? consumer_srefs : producer_srefs) { - const BlockNode* required_block = TVM_SREF_TO_BLOCK(required_block, required_block_sref); + const BlockNode* required_block = TVM_SREF_TO_BLOCK(required_block_sref); ICHECK(block2realize.count(required_block)); RelaxBufferRegions( /*binding=*/GetBindings(GetRef(block2realize.at(required_block))), @@ -576,8 +576,8 @@ void ComputeAtOrReverseComputeAtImpl(ScheduleState self, const StmtSRef& block_s const StmtSRef& loop_sref, bool preserve_unit_loops, arith::Analyzer* analyzer, bool check_only = false, int index = -1) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); // Step 1. 
Bunch of checks // Check condition 1) : scope stage pipeline StmtSRef scope_root_sref = GetScopeRoot(self, block_sref, diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc index ad15e06e285a..bfda66036fe3 100644 --- a/src/tir/schedule/primitive/compute_inline.cc +++ b/src/tir/schedule/primitive/compute_inline.cc @@ -174,7 +174,7 @@ class NonSingleProducerError : public ScheduleError { } } } - const BlockNode* block = TVM_SREF_TO_BLOCK(block, consumer_block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(consumer_block_sref); throw NonSingleProducerError(self->mod, GetRef(block)); } }; @@ -183,7 +183,7 @@ class OpaqueAccessError : public ScheduleError { public: explicit OpaqueAccessError(IRModule mod, StmtSRef scope_root_sref) : mod_(mod), scope_root_(nullptr) { - const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root, scope_root_sref); + const BlockNode* scope_root = TVM_SREF_TO_BLOCK(scope_root_sref); this->scope_root_ = GetRef(scope_root); } @@ -653,7 +653,7 @@ class ReverseComputeInliner : public BaseInliner { void ComputeInlineImpl(ScheduleState self, const StmtSRef& producer_block_sref, bool check_only = false) { - const BlockNode* _producer_block = TVM_SREF_TO_BLOCK(_producer_block, producer_block_sref); + const BlockNode* _producer_block = TVM_SREF_TO_BLOCK(producer_block_sref); Block producer_block = GetRef(_producer_block); HasInitBlock::Check(self->mod, producer_block); Buffer inlined_buffer = NotSingleReadWriteBuffer::GetSingleWrite(self, producer_block); @@ -698,7 +698,7 @@ bool CanComputeInline(const ScheduleState& self, const StmtSRef& producer_block_ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block_sref, bool check_only = false) { - const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(_consumer_block, consumer_block_sref); + const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref); Block consumer_block = GetRef(_consumer_block); 
HasInitBlock::Check(self->mod, consumer_block); // Step 1. Get the scope block diff --git a/src/tir/schedule/primitive/decompose_padding.cc b/src/tir/schedule/primitive/decompose_padding.cc index 365c6d43f127..93fb88e66619 100644 --- a/src/tir/schedule/primitive/decompose_padding.cc +++ b/src/tir/schedule/primitive/decompose_padding.cc @@ -415,7 +415,7 @@ StmtSRef DecomposePaddingImpl(ScheduleState self, const StmtSRef& block_sref, * - trim original block to write non-padding part only */ // Condition Checks and Information Collection - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); const BlockRealizeNode* realize = GetBlockRealize(self, block_sref).get(); Map dom_map; arith::Analyzer analyzer; diff --git a/src/tir/schedule/primitive/for_kind.cc b/src/tir/schedule/primitive/for_kind.cc index ec337224e59d..cc8cb55fd3fa 100644 --- a/src/tir/schedule/primitive/for_kind.cc +++ b/src/tir/schedule/primitive/for_kind.cc @@ -145,7 +145,7 @@ void CheckParallelizability(const ScheduleState& self, const For& loop, ForKind */ void ParallelizeComputation(const ScheduleState& self, const StmtSRef& loop_sref, ForKind for_kind, Optional thread_axis) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); /* * Check: @@ -186,7 +186,7 @@ void Bind(ScheduleState self, const StmtSRef& loop_sref, const IterVar& thread_a } void Unroll(ScheduleState self, const StmtSRef& loop_sref) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); ObjectPtr new_loop = make_object(*loop); new_loop->kind = ForKind::kUnrolled; new_loop->thread_binding = NullOpt; diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc index 746918ac4e34..cbdb99c6444f 100644 --- a/src/tir/schedule/primitive/get_block_loop.cc +++ b/src/tir/schedule/primitive/get_block_loop.cc @@ -40,7 +40,7 
@@ Array GetBlocks(const ScheduleState& self, const String& name, const G }; BaseFunc func = self->mod->Lookup(gv); - const auto* prim_func = TVM_TYPE_AS(prim_func, func, PrimFuncNode); + const auto* prim_func = TVM_TYPE_AS(func, PrimFuncNode); Finder finder(self, name); finder(prim_func->body); return std::move(finder.results_); diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 148b3ee033c3..b4e40fa120fe 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -134,7 +134,7 @@ class BufferIsSubregionError : public ScheduleError { void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index, BufferIndexType buffer_index_type, const IndexMap& index_map) { - const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer old_buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, buffer_index_type); Optional defining_site_sref; @@ -147,7 +147,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_ StmtSRef scope_sref = defining_site_sref.defined() ? 
defining_site_sref.value() : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); - const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref); + const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); // Step 1: Infer the shape of the new buffer ObjectPtr new_buffer_node = make_object(*(old_buffer.get())); @@ -344,7 +344,7 @@ class OpaqueNewIterTypeError : public ScheduleError { void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref, const IndexMap& index_map) { - const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); const Block& block = GetRef(block_ptr); arith::Analyzer analyzer; @@ -489,7 +489,7 @@ class BufferAxisSeparatorMutator : private ReplaceBufferMutator { void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer_index, BufferIndexType buffer_index_type, const Array& axis_separators) { - const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_ptr, block_sref); + const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer old_buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, buffer_index_type); Optional defining_site_sref; @@ -502,7 +502,7 @@ void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer StmtSRef scope_sref = defining_site_sref.defined() ? defining_site_sref.value() : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); - const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_block, scope_sref); + const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); // Step 1: Check and update axis_separators of the buffer. 
Buffer new_buffer = old_buffer; diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index f1b6f46e1b8f..2db3eb902aba 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -87,7 +87,7 @@ class IterMapSimplifyBlockBinding : public StmtExprMutator { bool preserve_unit_iters) { Map loop_var2extent; for (const StmtSRef& sref : loop_srefs) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, sref); + const ForNode* loop = TVM_SREF_TO_FOR(sref); loop_var2extent.Set(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent)); } return Downcast(IterMapSimplifyBlockBinding(opaque_blocks, std::move(loop_var2extent), @@ -389,7 +389,7 @@ Array Split(ScheduleState self, const StmtSRef& loop_sref, const Array // - The execution order has not changed. (The block executes with the same args and the same // order with before. // Step 1. Check correctness - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); if (!loop->annotations.empty() || loop->thread_binding.defined()) { throw HasAnnotationOrThreadBindingError(self->mod, GetRef(loop)); } @@ -445,7 +445,7 @@ Array Split(ScheduleState self, const StmtSRef& loop_sref, const Array result_srefs.reserve(n); for (int i = 0; i < n; i++) { result_srefs.push_back(self->stmt2ref.at(new_stmt.get())); - const ForNode* outer_loop = TVM_TYPE_AS(outer_loop, new_stmt, ForNode); + const ForNode* outer_loop = TVM_TYPE_AS(new_stmt, ForNode); new_stmt = outer_loop->body; } return result_srefs; @@ -464,7 +464,7 @@ StmtSRef Fuse(ScheduleState self, const Array& loop_srefs, bool preser std::unordered_set outer_loop_vars; // Step 1. 
check correctness for (const StmtSRef& sref : loop_srefs) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, sref); + const ForNode* loop = TVM_SREF_TO_FOR(sref); if (!loop->annotations.empty() || loop->thread_binding.defined()) { throw HasAnnotationOrThreadBindingError(self->mod, GetRef(loop)); } @@ -554,7 +554,7 @@ std::unordered_set CollectLoopsIntoSet( for (const StmtSRef& loop_sref : ordered_loop_srefs) { auto inserted = loop_srefs.insert(loop_sref.get()); if (!inserted.second) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); throw LoopMultiAppearanceError(self->mod, GetRef(loop)); } } diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index ad9043e4f2db..7a4ace736e48 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -123,7 +123,7 @@ class LoopHeightError : public ScheduleError { // loop_var of a higher loop shouldn't contain loop var const Var& loop_var = higher_loop->StmtAs()->loop_var; if (UsesVar(binding, [v = loop_var.get()](const VarNode* var) { return var == v; })) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); throw LoopHeightError(mod, GetRef(loop), GetRef(block)); } } @@ -183,8 +183,8 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref, * - generate corresponding init block and update block */ // Condition Checks and Information Collection - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); // Get the outer loops from high to low Array loops = GetLoops(block_sref); const BlockRealizeNode* realize = GetBlockRealize(self, block_sref).get(); @@ -264,7 +264,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& 
block_sref, std::unordered_map loop_var_map; Stmt body = BlockRealize(init_realize); for (int i : chosen_loops) { - const ForNode* old_loop = TVM_SREF_TO_FOR(old_loop, loops[i]); + const ForNode* old_loop = TVM_SREF_TO_FOR(loops[i]); // Create a new equivalent to the chosen loop Var old_loop_var = old_loop->loop_var; Var new_loop_var = old_loop_var.copy_with_suffix("_init"); @@ -277,7 +277,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref, } body = Substitute(body, loop_var_map); // Step 6. Mutate IR - const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(old_scope_root, scope_root_sref); + const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(scope_root_sref); Block new_scope_root{nullptr}; Block new_reduction_block{nullptr}; std::tie(new_scope_root, new_reduction_block) = DecomposeReductionBlockReplacer::Replace( @@ -1013,7 +1013,7 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax StmtSRef scope_root = GetScopeRoot(self, block_sref, // /*require_stage_pipeline=*/true); CheckReductionBlock(self, block_sref, scope_root); - const ForNode* rf_loop = TVM_SREF_TO_FOR(rf_loop, rf_loop_sref); + const ForNode* rf_loop = TVM_SREF_TO_FOR(rf_loop_sref); if (rf_loop->kind != ForKind::kSerial) { throw NotSerialLoopKindError(self->mod, GetRef(rf_loop)); } diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc index 1961565aac75..52b5add2bc9e 100644 --- a/src/tir/schedule/primitive/sampling.cc +++ b/src/tir/schedule/primitive/sampling.cc @@ -311,7 +311,7 @@ std::vector SamplePerfectTile( support::LinearCongruentialEngine::TRandState* rand_state, // const tir::StmtSRef& loop_sref, int32_t n_splits, int32_t max_innermost_factor, Optional>* decision) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); const int64_t* extent = GetLoopIntExtent(loop); std::vector result; if (extent == nullptr) { diff --git 
a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc index 07481ddb19e3..15d0e08ddc2c 100644 --- a/src/tir/schedule/state.cc +++ b/src/tir/schedule/state.cc @@ -208,7 +208,7 @@ class BlockInfoCollector : private StmtVisitor { if (is_root_block) { // If the block doesn't have outer loops and BlockRealize, // then we set the affine binding flag as true only if the block has no block vars - const BlockNode* block = TVM_SREF_TO_BLOCK(block, scope_root); + const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root); if (block->iter_vars.empty()) info.affine_binding = true; } else { info.affine_binding = @@ -233,7 +233,7 @@ class BlockInfoCollector : private StmtVisitor { block_reads_unbound.reserve(child_block_srefs.size()); block_writes_unbound.reserve(child_block_srefs.size()); for (const StmtSRef& block_sref : child_block_srefs) { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Map binding = GetBindings(block2realize_.at(block)); // Step 1.1. Unbind read regions Array reads; @@ -254,7 +254,7 @@ class BlockInfoCollector : private StmtVisitor { for (const auto& kv : info.scope->dst2deps) { const StmtSRef& consumer_block_sref = kv.first; const Array& deps = kv.second; - const BlockNode* consumer_block = TVM_SREF_TO_BLOCK(consumer_block, consumer_block_sref); + const BlockNode* consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref); const BlockRealize& consumer_realize = block2realize_.at(consumer_block); bool& region_cover = self_->block_info.at(consumer_block_sref).region_cover = true; // Step 2.1. Extract the path to the scope root @@ -851,7 +851,7 @@ class ChildReplacer : private StmtMutator { } else if (const auto* realize = stmt.as()) { // Case 2. 
stmt is BlockRealize, src_stmt is Block if (realize->block.get() == src_stmt) { - const auto* tgt_block = TVM_TYPE_AS(tgt_block, tgt_stmt_, BlockNode); + const auto* tgt_block = TVM_TYPE_AS(tgt_stmt_, BlockNode); ObjectPtr new_realize = make_object(*realize); new_realize->block = GetRef(tgt_block); new_stmt = BlockRealize(std::move(new_realize)); @@ -1044,9 +1044,9 @@ void ScheduleStateNode::Replace(const tir::StmtSRef& _src_sref, const Stmt& tgt_ // If `g_func` was unique, after the 3 lines above: // `ref_new_func` points to the same unique function that `g_func` points to // Update the body of the function the sref belongs to Assign - const auto* realize = TVM_TYPE_AS(realize, g_func->body, BlockRealizeNode); + const auto* realize = TVM_TYPE_AS(g_func->body, BlockRealizeNode); // Make `child_tgt_stmt` the root block - const auto* child_block = TVM_TYPE_AS(child_block, child_tgt_stmt, BlockNode); + const auto* child_block = TVM_TYPE_AS(child_tgt_stmt, BlockNode); ObjectPtr new_realize = make_object(*realize); new_realize->block = GetRef(child_block); new_func->body = BlockRealize(std::move(new_realize)); @@ -1078,7 +1078,7 @@ void ScheduleStateNode::DebugVerify() const { /**************** BlockInfo-related ****************/ BlockInfo ScheduleStateNode::GetBlockInfo(const StmtSRef& block_sref) const { - const BlockNode* block = TVM_SREF_TO_BLOCK(block, block_sref); + TVM_SREF_TO_BLOCK(block_sref); auto it = this->block_info.find(block_sref); CHECK(it != this->block_info.end()) << "IndexError: Cannot find the corresponding BlockScope to the block sref:\n" diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index 1c21d770db30..1ebaf202d487 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -36,7 +36,7 @@ Block WithAnnotation(const BlockNode* block, const String& attr_key, const Objec Buffer WithScope(const Buffer& buffer, const String& scope) { ObjectPtr new_buffer = make_object(*buffer.get()); ObjectPtr new_var = 
make_object(*buffer->data.get()); - const auto* ptr_type = TVM_TYPE_AS(ptr_type, buffer->data->type_annotation, PointerTypeNode); + const auto* ptr_type = TVM_TYPE_AS(buffer->data->type_annotation, PointerTypeNode); new_var->type_annotation = PointerType(ptr_type->element_type, scope); new_buffer->data = Var(new_var->name_hint + "_" + scope, new_var->type_annotation); new_buffer->name = buffer->name + "_" + scope; @@ -253,8 +253,8 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_ } } ICHECK(sref != nullptr && sref->stmt != nullptr); - const auto* leaf_block = TVM_SREF_TO_BLOCK(leaf_block, leaf_block_sref); - const auto* scope_block = TVM_SREF_TO_BLOCK(scope_block, sref); + const auto* leaf_block = TVM_SREF_TO_BLOCK(leaf_block_sref); + const auto* scope_block = TVM_SREF_TO_BLOCK(sref); throw OnlyLeafError(self->mod, GetRef(leaf_block), GetRef(scope_block)); } diff --git a/src/tir/schedule/utils.h b/src/tir/schedule/utils.h index 3db80989ae10..c289309acc2d 100644 --- a/src/tir/schedule/utils.h +++ b/src/tir/schedule/utils.h @@ -62,25 +62,35 @@ namespace tir { /*! * \brief A helper macro to convert an sref to the block it points to, - * throwing an internal error if downcasting fails - * \param Result The result variable, used for checking + * + * Throws an internal error if downcasting fails. The variable name + * in the parent scope is used for the error message. + * * \param SRef The SRef to be cast */ -#define TVM_SREF_TO_BLOCK(Result, SRef) \ - TVM_SREF_AS_OR_ERR(Result, SRef, ::tvm::tir::BlockNode) \ - << "TypeError: Expects StmtSRef `" << #SRef \ - << "` points to `Block`, but gets: " << (SRef->stmt ? SRef->stmt->GetTypeKey() : "None") +#define TVM_SREF_TO_BLOCK(SRef) \ + [&]() { \ + auto result = TVM_SREF_AS_OR_ERR(result, (SRef), ::tvm::tir::BlockNode) \ + << "TypeError: Expects StmtSRef `" << #SRef << "` points to `Block`, but gets: " \ + << ((SRef)->stmt ? (SRef)->stmt->GetTypeKey() : "None"); \ + return result; \ + }() /*! 
- * \brief A helper macro to convert an sref to the for-loop it points to, - * throwing an internal error if downcasting fails - * \param Result The name of the result variable, used for checking + * \brief A helper macro to convert an sref to the for-loop it points to + * + * Throws an internal error if downcasting fails. The variable name + * in the parent scope is used for the error message. + * * \param SRef The SRef to be cast */ -#define TVM_SREF_TO_FOR(Result, SRef) \ - TVM_SREF_AS_OR_ERR(Result, SRef, ::tvm::tir::ForNode) \ - << "TypeError: Expects StmtSRef `" << #SRef \ - << "` points to `Loop`, but gets: " << (SRef->stmt ? SRef->stmt->GetTypeKey() : "None") +#define TVM_SREF_TO_FOR(SRef) \ + [&]() { \ + auto result = TVM_SREF_AS_OR_ERR(result, (SRef), ::tvm::tir::ForNode) \ + << "TypeError: Expects StmtSRef `" << #SRef << "` points to `Loop`, but gets: " \ + << ((SRef)->stmt ? (SRef)->stmt->GetTypeKey() : "None"); \ + return result; \ + }() /*! * \brief Downcast a TVM ObjectRef to its corresponding container using `ObjectRef::as`, @@ -100,10 +110,13 @@ namespace tir { * \param From The ObjectRef to be downcast * \param Type The type to be downcast to */ -#define TVM_TYPE_AS(Result, From, Type) \ - TVM_TYPE_AS_OR_ERR(Result, From, Type) \ - << "TypeError: Expects `" << #From << "` to have type `" << Type::_type_key \ - << "`, but gets: " << (From.defined() ? From->GetTypeKey() : "None") +#define TVM_TYPE_AS(From, Type) \ + [&]() { \ + auto result = TVM_TYPE_AS_OR_ERR(result, (From), Type) \ + << "TypeError: Expects `" << #From << "` to have type `" << Type::_type_key \ + << "`, but gets: " << ((From).defined() ? (From)->GetTypeKey() : "None"); \ + return result; \ + }() /*! 
* \brief Convert an array of loop StmtSRefs to an array of loops @@ -114,7 +127,7 @@ inline Array LoopSRefs2Loops(const Array& loop_srefs) { Array loops; loops.reserve(loop_srefs.size()); for (StmtSRef loop_sref : loop_srefs) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); loops.push_back(GetRef(loop)); } return loops; @@ -264,7 +277,7 @@ inline const int64_t* GetLoopIntExtent(const ForNode* loop) { return as_const_in * \return The extent of the loop, nullptr if the extent is not constant */ inline const int64_t* GetLoopIntExtent(const StmtSRef& loop_sref) { - const ForNode* loop = TVM_SREF_TO_FOR(loop, loop_sref); + const ForNode* loop = TVM_SREF_TO_FOR(loop_sref); return as_const_int(loop->extent); } From 2e83e03b2c57f1e65938d7da48a48296c781f7a1 Mon Sep 17 00:00:00 2001 From: masahi Date: Sat, 27 Aug 2022 04:37:32 +0900 Subject: [PATCH 058/704] [CI] Update Hexagon image to install boost (#12613) The new image has xgboost installed, which I need for https://github.com/apache/tvm/pull/12587 Validated in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/ci-docker-staging/279/pipeline --- Jenkinsfile | 4 ++-- ci/jenkins/Jenkinsfile.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8c1ce9ed5020..3278e83098b7 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-19T15:38:38.311410 +// Generated at 2022-08-26T15:09:39.104767 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -57,7 +57,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3' ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3' ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3' ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3' -ci_hexagon = 'tlcpack/ci-hexagon:20220810-060142-fae79bbc3' +ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f' ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3' // <--- End of regex-scanned config. diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 index be2776c6d9e3..c932431a44a1 100644 --- a/ci/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -59,7 +59,7 @@ ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3' ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3' ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3' ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3' -ci_hexagon = 'tlcpack/ci-hexagon:20220810-060142-fae79bbc3' +ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f' ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3' // <--- End of regex-scanned config. From 23e794422a66ccfca8d58435e341c2af58f505e2 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 26 Aug 2022 15:59:53 -0500 Subject: [PATCH 059/704] Replace '> >' in templates with >>, NFC (#12615) The problem with greedy lexing of >> as an operator was solved in C++11, and now templates no longer require spaces between >'s. 
--- docs/arch/convert_layout.rst | 10 +++--- docs/arch/inferbound.rst | 4 +-- .../how_to/relay_bring_your_own_codegen.rst | 2 +- include/tvm/auto_scheduler/feature.h | 8 ++--- include/tvm/relay/attrs/image.h | 14 ++++---- include/tvm/runtime/module.h | 2 +- include/tvm/support/span.h | 2 +- include/tvm/te/operation.h | 2 +- include/tvm/topi/detail/extern.h | 2 +- include/tvm/topi/transform.h | 2 +- .../native/org_apache_tvm_native_c_api.cc | 4 +-- src/arith/analyzer.cc | 2 +- src/autotvm/touch_extractor.cc | 14 ++++---- src/contrib/ethosu/cascader/propagator.cc | 8 ++--- src/contrib/ethosu/cascader/propagator.h | 6 ++-- src/ir/span.cc | 2 +- src/node/reflection.cc | 2 +- src/printer/meta_data.h | 2 +- src/relay/analysis/dependency_graph.cc | 4 +-- src/relay/ir/transform.cc | 2 +- src/relay/transforms/convert_sparse_dense.cc | 8 ++--- src/relay/transforms/fuse_ops.cc | 2 +- src/relay/transforms/let_list.h | 2 +- src/relay/transforms/partial_eval.cc | 2 +- src/relay/transforms/type_infer.cc | 4 +-- src/runtime/contrib/ethosn/ethosn_device.cc | 6 ++-- src/runtime/graph_executor/graph_executor.cc | 4 +-- src/runtime/metal/metal_common.h | 4 +-- src/runtime/thread_pool.cc | 2 +- src/runtime/threading_backend.cc | 2 +- src/runtime/vm/pooled_allocator.h | 2 +- src/target/source/codegen_vhls.cc | 2 +- src/te/operation/compute_op.cc | 8 ++--- src/te/operation/compute_op.h | 4 +-- src/te/operation/tensor_compute_op.cc | 13 ++++--- src/te/operation/tensorize.cc | 29 ++++++++-------- src/te/schedule/graph.h | 6 ++-- src/te/schedule/schedule_dataflow_rewrite.cc | 2 +- src/tir/ir/buffer.cc | 8 ++--- src/tir/transforms/coproc_sync.cc | 34 +++++++++---------- src/tir/transforms/inject_double_buffer.cc | 4 +-- src/tir/transforms/inject_virtual_thread.cc | 2 +- src/tir/transforms/ir_utils.h | 2 +- src/tir/transforms/make_packed_api.cc | 6 ++-- src/tir/transforms/storage_access.h | 2 +- src/tir/transforms/storage_rewrite.cc | 4 +-- 46 files changed, 128 insertions(+), 130 
deletions(-) diff --git a/docs/arch/convert_layout.rst b/docs/arch/convert_layout.rst index 53038e9605e8..51917fce44df 100644 --- a/docs/arch/convert_layout.rst +++ b/docs/arch/convert_layout.rst @@ -150,10 +150,10 @@ First example is for layout agnostic operators. These operators do not have any // .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); // Take arbitrary input layouts and copy to outputs. - inline Array > ElemwiseArbitraryLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array> &old_in_shapes) { + inline Array> ElemwiseArbitraryLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array> &old_in_shapes) { Layout ret; if (new_in_layouts.defined()) { @@ -168,7 +168,7 @@ First example is for layout agnostic operators. These operators do not have any } } - return Array >{Array(old_in_layouts.size(), ret), {ret}}; + return Array>{Array(old_in_layouts.size(), ret), {ret}}; } diff --git a/docs/arch/inferbound.rst b/docs/arch/inferbound.rst index 9c78a9da7440..cc516359bdba 100644 --- a/docs/arch/inferbound.rst +++ b/docs/arch/inferbound.rst @@ -280,7 +280,7 @@ Phase 3: Propagate IntSets to consumer's input tensors /* * Input: Map dom_map: consumer root -> IntSet - * Output: Map tmap: output tensor -> vector > + * Output: Map tmap: output tensor -> vector> */ Note that the consumer's input tensors are output tensors of the stage InferBound is working on. So by establishing information about the consumer's input tensors, we actually obtain information about the stage's output tensors too: the consumers require certain regions of these tensors to be computed. This information can then be propagated through the rest of the stage, eventually obtaining Ranges for the stage's root_iter_vars by the end of Phase 4. @@ -306,7 +306,7 @@ Phase 4: Consolidate across all consumers .. 
code:: cpp /* - * Input: Map tmap: output tensor -> vector > + * Input: Map tmap: output tensor -> vector> * Output: Map rmap: rmap is populated for all of the stage's root_iter_vars */ diff --git a/docs/dev/how_to/relay_bring_your_own_codegen.rst b/docs/dev/how_to/relay_bring_your_own_codegen.rst index 304bd016dec2..c106bb2a6372 100644 --- a/docs/dev/how_to/relay_bring_your_own_codegen.rst +++ b/docs/dev/how_to/relay_bring_your_own_codegen.rst @@ -676,7 +676,7 @@ Again, we first define a customized runtime class as follows. The class has to b /* \brief The subgraph that being processed. */ std::string curr_subgraph_; /*! \brief A simple graph from subgraph id to node entries. */ - std::map > graph_; + std::map> graph_; /* \brief A simple pool to contain the tensor for each node in the graph. */ std::vector data_entry_; /* \brief A mapping from node id to op name. */ diff --git a/include/tvm/auto_scheduler/feature.h b/include/tvm/auto_scheduler/feature.h index 71d00f249210..a8b88b7f11f9 100644 --- a/include/tvm/auto_scheduler/feature.h +++ b/include/tvm/auto_scheduler/feature.h @@ -70,7 +70,7 @@ void GetPerStoreFeatureName(int max_n_bufs, std::vector* ret); */ void GetPerStoreFeaturesFromStates(const Array& states, const SearchTask& task, int skip_first_n_feature_extraction, int max_n_bufs, - std::vector >* features); + std::vector>* features); /*! * \brief Get per-store feature from states of different tasks @@ -83,7 +83,7 @@ void GetPerStoreFeaturesFromStates(const Array& states, const SearchTask& */ void GetPerStoreFeaturesFromStates(const Array& states, const std::vector& tasks, int skip_first_n_feature_extraction, int max_n_bufs, - std::vector >* features); + std::vector>* features); /*! 
* \brief Get per-store features from a log file @@ -96,7 +96,7 @@ void GetPerStoreFeaturesFromStates(const Array& states, const std::vector * \param task_ids The task ids for all states */ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int max_n_bufs, - std::vector >* features, + std::vector>* features, std::vector* normalized_throughputs, std::vector* task_ids); @@ -114,7 +114,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, const Array& results, int skip_first_n_feature_extraction, int max_n_bufs, - std::vector >* features, + std::vector>* features, std::vector* normalized_throughputs, std::vector* task_ids); diff --git a/include/tvm/relay/attrs/image.h b/include/tvm/relay/attrs/image.h index e0ee6dc748c2..43510ea68501 100644 --- a/include/tvm/relay/attrs/image.h +++ b/include/tvm/relay/attrs/image.h @@ -46,9 +46,9 @@ struct Resize1DAttrs : public tvm::AttrsNode { DataType out_dtype; TVM_DECLARE_ATTRS(Resize1DAttrs, "relay.attrs.Resize1DAttrs") { - TVM_ATTR_FIELD(size).set_default(NullValue >()).describe("Output Size."); + TVM_ATTR_FIELD(size).set_default(NullValue>()).describe("Output Size."); TVM_ATTR_FIELD(roi) - .set_default(NullValue >()) + .set_default(NullValue>()) .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'"); TVM_ATTR_FIELD(layout).set_default("NCW").describe( "Dimension ordering of input data. Can be 'NCW', 'NWC', etc." 
@@ -99,9 +99,9 @@ struct Resize2DAttrs : public tvm::AttrsNode { DataType out_dtype; TVM_DECLARE_ATTRS(Resize2DAttrs, "relay.attrs.Resize2DAttrs") { - TVM_ATTR_FIELD(size).set_default(NullValue >()).describe("Output Size."); + TVM_ATTR_FIELD(size).set_default(NullValue>()).describe("Output Size."); TVM_ATTR_FIELD(roi) - .set_default(NullValue >()) + .set_default(NullValue>()) .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'"); TVM_ATTR_FIELD(layout).set_default("NCHW").describe( "Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc." @@ -152,9 +152,9 @@ struct Resize3DAttrs : public tvm::AttrsNode { DataType out_dtype; TVM_DECLARE_ATTRS(Resize3DAttrs, "relay.attrs.Resize3DAttrs") { - TVM_ATTR_FIELD(size).set_default(NullValue >()).describe("Output Size."); + TVM_ATTR_FIELD(size).set_default(NullValue>()).describe("Output Size."); TVM_ATTR_FIELD(roi) - .set_default(NullValue >()) + .set_default(NullValue>()) .describe("Region of Interest for coordinate transformation mode 'tf_crop_and_resize'"); TVM_ATTR_FIELD(layout).set_default("NCDHW").describe( "Dimension ordering of input data. Can be 'NCDHW', 'NDHWC', etc." @@ -200,7 +200,7 @@ struct CropAndResizeAttrs : public tvm::AttrsNode { DataType out_dtype; TVM_DECLARE_ATTRS(CropAndResizeAttrs, "relay.attrs.CropAndResizeAttrs") { - TVM_ATTR_FIELD(crop_size).set_default(NullValue >()).describe("Target Size."); + TVM_ATTR_FIELD(crop_size).set_default(NullValue>()).describe("Target Size."); TVM_ATTR_FIELD(layout).set_default("NCHW").describe( "Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc." "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h index 9d139c9feff3..a54f98a558f3 100644 --- a/include/tvm/runtime/module.h +++ b/include/tvm/runtime/module.h @@ -234,7 +234,7 @@ class TVM_DLL ModuleNode : public Object { private: /*! 
\brief Cache used by GetImport */ - std::unordered_map > import_cache_; + std::unordered_map> import_cache_; std::mutex mutex_; }; diff --git a/include/tvm/support/span.h b/include/tvm/support/span.h index 603fb531f43a..689a48dee788 100644 --- a/include/tvm/support/span.h +++ b/include/tvm/support/span.h @@ -68,7 +68,7 @@ class Span { inline bool operator!=(iterator_base other) { return !(*this == other); } - template ::value> > + template ::value>> inline operator iterator_base() const { return iterator_base(ptr_, end_); } diff --git a/include/tvm/te/operation.h b/include/tvm/te/operation.h index e91a0930f37b..2c50f3c3157b 100644 --- a/include/tvm/te/operation.h +++ b/include/tvm/te/operation.h @@ -47,7 +47,7 @@ struct TensorDom { // constructor explicit TensorDom(int ndim) : data(ndim) {} /*! \brief The domain data */ - std::vector > data; + std::vector> data; }; /*! diff --git a/include/tvm/topi/detail/extern.h b/include/tvm/topi/detail/extern.h index 2561f8d1ca27..dee4bf70a729 100644 --- a/include/tvm/topi/detail/extern.h +++ b/include/tvm/topi/detail/extern.h @@ -75,7 +75,7 @@ using FExtern = std::function, Array)>; * be one output Tensor for each element of out_shapes, with dtype equal to the corresponding * element of out_types. 
*/ -inline Array make_extern(const Array >& out_shapes, +inline Array make_extern(const Array>& out_shapes, const std::vector& out_types, const Array& inputs, FExtern fextern, std::string name, std::string tag, ::tvm::Map attrs) { diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index 7accbf86912d..4c96ed42f6e9 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -592,7 +592,7 @@ inline Array split(const Tensor& x, Array split_indices, int a begin_ids.push_back(idx); } - Array > out_shapes; + Array> out_shapes; for (size_t i = 0; i < begin_ids.size(); ++i) { PrimExpr out_axis_size; if (i == begin_ids.size() - 1) { diff --git a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc index f7be0cf80eb0..f86191d45bbc 100644 --- a/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/org_apache_tvm_native_c_api.cc @@ -42,8 +42,8 @@ struct TVMFuncArgsThreadLocalEntry { std::vector tvmFuncArgValues; std::vector tvmFuncArgTypes; // for later release - std::vector > tvmFuncArgPushedStrs; - std::vector > tvmFuncArgPushedBytes; + std::vector> tvmFuncArgPushedStrs; + std::vector> tvmFuncArgPushedBytes; }; typedef dmlc::ThreadLocalStore TVMFuncArgsThreadLocalStore; diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc index f32c9b2ff4cf..ad52a6578b24 100644 --- a/src/arith/analyzer.cc +++ b/src/arith/analyzer.cc @@ -186,7 +186,7 @@ TVM_REGISTER_GLOBAL("arith.CreateAnalyzer").set_body([](TVMArgs args, TVMRetValu return PackedFunc([self](TVMArgs args, TVMRetValue* ret) { // can't use make_shared due to noexcept(false) decl in destructor, // see https://stackoverflow.com/a/43907314 - auto ctx = std::shared_ptr >( + auto ctx = std::shared_ptr>( new With(self.get(), args[0])); auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { ctx.reset(); }; *ret = PackedFunc(fexit); diff --git a/src/autotvm/touch_extractor.cc 
b/src/autotvm/touch_extractor.cc index 10ead718bae2..dd3cf88f7bf6 100644 --- a/src/autotvm/touch_extractor.cc +++ b/src/autotvm/touch_extractor.cc @@ -220,7 +220,7 @@ void TouchExtractor::ExitMem_() {} * \note If you want to flatten these features as the input of your model, * You can use the faster one GetItervarFeatureFlatten below. */ -void GetItervarFeature(Stmt stmt, bool take_log, Array > >* ret_feature) { +void GetItervarFeature(Stmt stmt, bool take_log, Array>>* ret_feature) { // extract TouchExtractor touch_analyzer; touch_analyzer.Analyze(stmt); @@ -248,7 +248,7 @@ void GetItervarFeature(Stmt stmt, bool take_log, Array > > // serialize for front end for (auto var : vars) { - Array > feature_row; + Array> feature_row; ItervarFeature& fea = touch_analyzer.itervar_map[var]; feature_row.push_back(Array{tvm::tir::StringImm("_itervar_"), var}); @@ -389,10 +389,10 @@ void GetCurveSampleFeatureFlatten(Stmt stmt, int sample_n, std::vector* r }); int max_depth = 0; - std::map > reuse_curve; - std::map > count_curve; - std::map > topdown_curve; - std::map > bottomup_curve; + std::map> reuse_curve; + std::map> count_curve; + std::map> topdown_curve; + std::map> bottomup_curve; std::set innermost_buffers; std::set added; @@ -485,7 +485,7 @@ TVM_REGISTER_GLOBAL("autotvm.feature.GetItervarFeature") .set_body([](TVMArgs args, TVMRetValue* ret) { Stmt stmt = args[0]; bool take_log = args[1]; - Array > > ret_feature; + Array>> ret_feature; GetItervarFeature(stmt, take_log, &ret_feature); diff --git a/src/contrib/ethosu/cascader/propagator.cc b/src/contrib/ethosu/cascader/propagator.cc index 25b711a53d05..ca8aaf6e27d5 100644 --- a/src/contrib/ethosu/cascader/propagator.cc +++ b/src/contrib/ethosu/cascader/propagator.cc @@ -34,7 +34,7 @@ namespace ethosu { namespace cascader { void PropagatorNode::VisitAttrs(AttrVisitor* v) { - Array > tmp_transform; + Array> tmp_transform; for (const auto& vec : transform_) { tmp_transform.push_back(make_array(vec)); } @@ -43,7 +43,7 @@ 
void PropagatorNode::VisitAttrs(AttrVisitor* v) { v->Visit("_offset", &tmp_arr); } -Propagator::Propagator(const std::vector >& transform, +Propagator::Propagator(const std::vector>& transform, const std::vector& offset) { auto n = make_object(); size_t rows = transform.size(); @@ -102,8 +102,8 @@ StripeConfig PropagatorNode::propagate(const StripeConfig& stripe_config) const } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.Propagator") - .set_body_typed([](Array > transform, Array offset) { - std::vector > vtransform; + .set_body_typed([](Array> transform, Array offset) { + std::vector> vtransform; for (const auto& vec : transform) { vtransform.push_back(make_vector(vec)); } diff --git a/src/contrib/ethosu/cascader/propagator.h b/src/contrib/ethosu/cascader/propagator.h index 2d4bd0d0154a..3946d0806a0c 100644 --- a/src/contrib/ethosu/cascader/propagator.h +++ b/src/contrib/ethosu/cascader/propagator.h @@ -43,7 +43,7 @@ class PropagatorNode : public Object { void VisitAttrs(AttrVisitor* v); /*! \return The transform matrix to apply to the StripeConfigs */ - const std::vector > GetTransform() const { return transform_; } + const std::vector> GetTransform() const { return transform_; } /*! \return The offset vector to apply to the StripeConfigs */ const std::vector GetOffset() const { return offset_; } /*! \return The number of input dimensions */ @@ -92,7 +92,7 @@ class PropagatorNode : public Object { friend class Propagator; /*! \brief The transform matrix to apply to the StripeConfigs */ - std::vector > transform_; + std::vector> transform_; /*! 
\brief The offset vector to apply to the StripeConfigs */ std::vector offset_; }; @@ -124,7 +124,7 @@ class PropagatorNode : public Object { */ class Propagator : public ObjectRef { public: - Propagator(const std::vector >& transform, const std::vector& offset); + Propagator(const std::vector>& transform, const std::vector& offset); TVM_DEFINE_OBJECT_REF_METHODS(Propagator, ObjectRef, PropagatorNode); }; diff --git a/src/ir/span.cc b/src/ir/span.cc index 4a26f3a6eb11..e19bef4cb864 100644 --- a/src/ir/span.cc +++ b/src/ir/span.cc @@ -30,7 +30,7 @@ namespace tvm { ObjectPtr GetSourceNameNode(const String& name) { // always return pointer as the reference can change as map re-allocate. // or use another level of indirection by creating a unique_ptr - static std::unordered_map > source_map; + static std::unordered_map> source_map; auto sn = source_map.find(name); if (sn == source_map.end()) { diff --git a/src/node/reflection.cc b/src/node/reflection.cc index a0f83f6cf5ad..aa572e99658c 100644 --- a/src/node/reflection.cc +++ b/src/node/reflection.cc @@ -254,7 +254,7 @@ void NodeListAttrNames(TVMArgs args, TVMRetValue* ret) { Object* self = static_cast(args[0].value().v_handle); auto names = - std::make_shared >(ReflectionVTable::Global()->ListAttrNames(self)); + std::make_shared>(ReflectionVTable::Global()->ListAttrNames(self)); *ret = PackedFunc([names](TVMArgs args, TVMRetValue* rv) { int64_t i = args[0]; diff --git a/src/printer/meta_data.h b/src/printer/meta_data.h index b076ad07caaf..ddf0d78087ee 100644 --- a/src/printer/meta_data.h +++ b/src/printer/meta_data.h @@ -136,7 +136,7 @@ class TextMetaDataContext { private: /*! \brief additional metadata stored in TVM json format */ - std::unordered_map > meta_data_; + std::unordered_map> meta_data_; /*! 
\brief map from meta data into its string representation */ std::unordered_map meta_repr_; }; diff --git a/src/relay/analysis/dependency_graph.cc b/src/relay/analysis/dependency_graph.cc index 18913ca37562..91711fa4baa8 100644 --- a/src/relay/analysis/dependency_graph.cc +++ b/src/relay/analysis/dependency_graph.cc @@ -56,11 +56,11 @@ class DependencyGraph::Creator : private MixedModeVisitor { } void Depend(DependencyGraph::Node* parent, DependencyGraph::Node* child) { - auto* parent_link = arena_->make >(); + auto* parent_link = arena_->make>(); parent_link->value = parent; child->parents.Push(parent_link); - auto* child_link = arena_->make >(); + auto* child_link = arena_->make>(); child_link->value = child; parent->children.Push(child_link); } diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc index 1a16cc9becf1..fc1f3a15077e 100644 --- a/src/relay/ir/transform.cc +++ b/src/relay/ir/transform.cc @@ -126,7 +126,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx) IRModule updated_mod = mod->ShallowCopy(); - std::vector > updates; + std::vector> updates; for (const auto& kv : mod->functions) { // only process optimizable Relay Functions if (const auto* function_node = AsOptimizableFunctionNode(kv.second)) { diff --git a/src/relay/transforms/convert_sparse_dense.cc b/src/relay/transforms/convert_sparse_dense.cc index faba366eca49..7053f1301cca 100644 --- a/src/relay/transforms/convert_sparse_dense.cc +++ b/src/relay/transforms/convert_sparse_dense.cc @@ -73,7 +73,7 @@ TVM_REGISTER_GLOBAL("relay.analysis.search_dense_op_weight").set_body_typed(Sear class DenseToSparseDenseMutator : public ExprRewriter { public: DenseToSparseDenseMutator(const Array& weight_name, - const Array >& weight_shape) + const Array>& weight_shape) : dense_op_(Op::Get("nn.dense")), sparse_dense_op_(Op::Get("nn.sparse_dense")) { ICHECK_EQ(weight_name.size(), weight_shape.size()); for (size_t i = 0; i < weight_name.size(); ++i) { @@ -117,11 
+117,11 @@ class DenseToSparseDenseMutator : public ExprRewriter { // Cached op const Op& dense_op_; const Op& sparse_dense_op_; - std::unordered_map > target_weights_; + std::unordered_map> target_weights_; }; // class DenseToSparseDenseAlter Expr DenseToSparse(const Expr& e, const Array& weight_name, - const Array >& weight_shape) { + const Array>& weight_shape) { auto rewriter = DenseToSparseDenseMutator(weight_name, weight_shape); return PostOrderRewrite(e, &rewriter); } @@ -129,7 +129,7 @@ Expr DenseToSparse(const Expr& e, const Array& weight_name, namespace transform { Pass DenseToSparse(const Array& weight_name, - const Array >& weight_shape) { + const Array>& weight_shape) { runtime::TypedPackedFunc pass_func = [=](Function f, IRModule m, PassContext pc) { // Remove FreeVar warnings diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 1ced0883a14c..dac5dc69ead5 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -180,7 +180,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { graph_.node_map[key] = current; } if (parent != nullptr) { - auto* link = arena_->make >(); + auto* link = arena_->make>(); link->value.node = parent; link->value.pattern = pattern; current->outputs.Push(link); diff --git a/src/relay/transforms/let_list.h b/src/relay/transforms/let_list.h index f449d6c3b011..f908fbcee514 100644 --- a/src/relay/transforms/let_list.h +++ b/src/relay/transforms/let_list.h @@ -145,7 +145,7 @@ class LetList { } private: - std::vector > lets_; + std::vector> lets_; bool used_ = false; }; diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index fc9922ca03ef..f791192e25c1 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -772,7 +772,7 @@ class PartialEvaluator : public ExprFunctor if (func->HasNonzeroAttr(attr::kPrimitive)) { return ConstEvaluateFunc(func); } - std::vector > free_vars; + 
std::vector> free_vars; for (const auto& v : FreeVars(func)) { if (v != var) { free_vars.push_back(std::pair(v, env_.Lookup(v))); diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index 9c01c40517f4..d2eb48073f7d 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -829,7 +829,7 @@ void EnsureCheckedType(const Expr& e) { AllCheckTypePopulated().VisitExpr(e); } // TODO(@jroesch): Can we optimize this? void AddGlobalTypes(IRModule mod) { - std::vector > updates; + std::vector> updates; for (const auto& it : mod->functions) { // Currently we don't type check TIR. // The inferencer will only check Relay functions @@ -961,7 +961,7 @@ Pass InferType() { // Add all the type annotations to the functions in the model. AddGlobalTypes(mod); - std::vector > updates; + std::vector> updates; for (const auto& it : updated_mod->functions) { // Currently we don't type check TIR. // diff --git a/src/runtime/contrib/ethosn/ethosn_device.cc b/src/runtime/contrib/ethosn/ethosn_device.cc index 628f99788d16..900ae65afcc3 100644 --- a/src/runtime/contrib/ethosn/ethosn_device.cc +++ b/src/runtime/contrib/ethosn/ethosn_device.cc @@ -87,7 +87,7 @@ void CopyOutput(dl::Buffer* source_buffers[], std::vector* outputs) { } } -void CreateBuffers(std::vector >* fm, +void CreateBuffers(std::vector>* fm, const std::vector& tensors, const std::vector& tensor_sizes, bool input) { for (size_t i = 0; i < tensors.size(); i++) { @@ -118,11 +118,11 @@ bool Inference(tvm::runtime::TVMArgs args, dl::Network* npu, } // Set up input buffers - std::vector > ifm(inputs.size()); + std::vector> ifm(inputs.size()); CreateBuffers(&ifm, inputs, input_sizes, true); // Set up output buffers - std::vector > ofm(outputs.size()); + std::vector> ofm(outputs.size()); CreateBuffers(&ofm, outputs, output_sizes, false); // Raw pointers for the inference diff --git a/src/runtime/graph_executor/graph_executor.cc 
b/src/runtime/graph_executor/graph_executor.cc index 78e65f6f2319..e3113dbfe54c 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -519,8 +519,8 @@ void GraphExecutor::SetupOpExecs() { } } -std::pair, std::shared_ptr > -GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector& args) { +std::pair, std::shared_ptr> GraphExecutor::CreateTVMOp( + const TVMOpParam& param, const std::vector& args) { std::shared_ptr arg_ptr = std::make_shared(); // setup address. arg_ptr->args = args; diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index 47a5999fdce9..dad156bcdddc 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -133,7 +133,7 @@ class Stream { class MetalWorkspace final : public DeviceAPI { public: // the devices - std::vector > devices; + std::vector> devices; // Warp size constant std::vector warp_size; // Whether it is initialized. @@ -186,7 +186,7 @@ class MetalThreadEntry { /*! \brief The current stream */ std::vector stream; /*! \brief The shared buffer used for copy. */ - std::vector > temp_buffer_; + std::vector> temp_buffer_; /*! \brief workspace pool */ WorkspacePool pool; // constructor diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index 7744174ec866..665244d3d1bd 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -369,7 +369,7 @@ class ThreadPool { int num_workers_used_; // if or not to exclude worker 0 and use main to run task 0 bool exclude_worker0_{true}; - std::vector > queues_; + std::vector> queues_; std::unique_ptr threads_; }; diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 14b5f27dd495..ef1aa69f6455 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -285,7 +285,7 @@ class ThreadGroup::Impl { // is not supported in earlier versions of QuRT. In such cases assume 4. 
if (threads == 0) threads = 4; #endif - std::vector > max_freqs; + std::vector> max_freqs; for (unsigned int i = 0; i < threads; ++i) { int64_t cur_freq = 0; diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index e5f236983a73..9c11c783011e 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -99,7 +99,7 @@ class PooledAllocator final : public Allocator { private: size_t page_size_; std::atomic used_memory_; - std::unordered_map > memory_pool_; + std::unordered_map> memory_pool_; std::recursive_mutex mu_; Device device_; }; diff --git a/src/target/source/codegen_vhls.cc b/src/target/source/codegen_vhls.cc index 9896d8b833f9..4091b64f4524 100644 --- a/src/target/source/codegen_vhls.cc +++ b/src/target/source/codegen_vhls.cc @@ -157,7 +157,7 @@ runtime::Module BuildSDAccel(IRModule mod, Target target) { std::string whole_code = cg.Finish(); // Generate source code for compilation. - Array > kernel_info; + Array> kernel_info; for (auto kv : mod->functions) { ICHECK(kv.second->IsInstance()) << "CodeGenOpenCL: Can only take PrimFunc"; diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc index c3062045939a..7f8facad5568 100644 --- a/src/te/operation/compute_op.cc +++ b/src/te/operation/compute_op.cc @@ -357,10 +357,10 @@ Stmt MakeComputeStmt(const ComputeOpNode* self, const Stage& stage, init = MergeNest(n.init_nest, init); init = Substitute(init, n.init_vmap); // common nest - std::vector > common(n.main_nest.begin(), - n.main_nest.begin() + n.num_common_loop + 1); - std::vector > reduce(n.main_nest.begin() + n.num_common_loop + 1, - n.main_nest.end()); + std::vector> common(n.main_nest.begin(), + n.main_nest.begin() + n.num_common_loop + 1); + std::vector> reduce(n.main_nest.begin() + n.num_common_loop + 1, + n.main_nest.end()); provide = MergeNest(reduce, provide); if (debug_keep_trivial_loop) { provide = MergeNest(common, provide); diff --git a/src/te/operation/compute_op.h 
b/src/te/operation/compute_op.h index 2661eb976f2e..944334a41fdb 100644 --- a/src/te/operation/compute_op.h +++ b/src/te/operation/compute_op.h @@ -41,13 +41,13 @@ struct ComputeLoopNest { // predicates for the initialize loop std::vector init_predicates; // Initialization nest involved. - std::vector > init_nest; + std::vector> init_nest; // Value map for the init code std::unordered_map init_vmap; // Predicates for the main update loop std::vector main_predicates; // The general loop nest - std::vector > main_nest; + std::vector> main_nest; // Value map for the IterVar. std::unordered_map main_vmap; diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc index 262e5a2b97f4..00f751c58a09 100644 --- a/src/te/operation/tensor_compute_op.cc +++ b/src/te/operation/tensor_compute_op.cc @@ -202,7 +202,7 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, ComputeLoopNest n = ComputeLoopNest::Create(this, stage, dom_map, debug_keep_trivial_loop); if (this->reduce_axis.size() == 0) { - std::vector > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); + std::vector> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); nest.emplace_back(MakeIfNest(n.main_predicates)); ICHECK_EQ(n.init_predicates.size(), 0U); ICHECK(this->intrin->body.defined()) @@ -219,16 +219,15 @@ Stmt TensorComputeOpNode::BuildProvide(const Stage& stage, ICHECK(this->intrin->reduce_update.defined()) << "Reduction update op is not defined"; // Need init and update steps ICHECK_NE(this->reduce_axis.size(), 0U); - std::vector > common(n.main_nest.begin(), - n.main_nest.begin() + n.num_common_loop + 1); - std::vector > update_nest(n.main_nest.begin() + n.num_common_loop + 1, - n.main_nest.begin() + tloc + 1); + std::vector> common(n.main_nest.begin(), + n.main_nest.begin() + n.num_common_loop + 1); + std::vector> update_nest(n.main_nest.begin() + n.num_common_loop + 1, + n.main_nest.begin() + tloc + 1); 
update_nest.emplace_back(MakeIfNest(n.main_predicates)); if (this->intrin->reduce_init.defined()) { // init nest - std::vector > init_nest(n.init_nest.begin(), - n.init_nest.begin() + tloc + 1); + std::vector> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1); init_nest.emplace_back(MakeIfNest(n.init_predicates)); Stmt init = MergeNest(output_bind_nest, this->intrin->reduce_init); init = te::Substitute(init, n.init_vmap); diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc index b31b61b739c1..138aeeb37f19 100644 --- a/src/te/operation/tensorize.cc +++ b/src/te/operation/tensorize.cc @@ -42,7 +42,7 @@ using namespace tir; size_t InferTensorizeRegion(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, std::unordered_map* out_dom, - std::unordered_map >* in_region) { + std::unordered_map>* in_region) { // Get the bound of the tensorized scope. bool found_point = false; size_t loc_scope = 0; @@ -198,7 +198,7 @@ class TensorIntrinMatcher final : public StmtExprMutator { void Init(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, const std::unordered_map& out_dom, - const std::unordered_map >& in_region, const TensorIntrin& intrin, + const std::unordered_map>& in_region, const TensorIntrin& intrin, Map* compute_intrin_iter_space) { ICHECK(self == stage->op.get()); @@ -298,7 +298,7 @@ class TensorIntrinMatcher final : public StmtExprMutator { Array MatchTensorizeBody(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, const std::unordered_map& out_dom, - const std::unordered_map >& in_region, + const std::unordered_map>& in_region, const TensorIntrin& intrin, Map* compute_intrin_iter_space) { TensorIntrinMatcher matcher; @@ -314,7 +314,7 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& value_map, const std::unordered_map& dom_map, const std::unordered_map& out_dom, - const std::unordered_map >& 
in_region, + const std::unordered_map>& in_region, const TensorIntrin& intrin) { StructuralEqual expr_equal; Map compute_intrin_iter_space; @@ -346,7 +346,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) { std::unordered_map out_dom; - std::unordered_map > in_region; + std::unordered_map> in_region; size_t tloc = InferTensorizeRegion(self, stage, dom_map, &out_dom, &in_region); TensorIntrin intrin = stage->iter_var_attrs.at(stage->leaf_iter_vars[tloc])->tensor_intrin; ICHECK(intrin.defined()); @@ -418,7 +418,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, } if (tloc <= n.num_common_loop) { // Do no need to split reduction - std::vector > nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); + std::vector> nest(n.main_nest.begin(), n.main_nest.begin() + tloc + 1); nest.emplace_back(MakeIfNest(n.main_predicates)); ICHECK_EQ(n.init_predicates.size(), 0U); ICHECK(intrin->body.defined()) << "Normal store op for intrin " << intrin << " is not defined"; @@ -434,16 +434,15 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, << "Reduction update op for intrin " << intrin << " is not defined"; // Need init and update steps ICHECK_NE(self->reduce_axis.size(), 0U); - std::vector > common(n.main_nest.begin(), - n.main_nest.begin() + n.num_common_loop + 1); - std::vector > update_nest(n.main_nest.begin() + n.num_common_loop + 1, - n.main_nest.begin() + tloc + 1); + std::vector> common(n.main_nest.begin(), + n.main_nest.begin() + n.num_common_loop + 1); + std::vector> update_nest(n.main_nest.begin() + n.num_common_loop + 1, + n.main_nest.begin() + tloc + 1); update_nest.emplace_back(MakeIfNest(n.main_predicates)); if (intrin->reduce_init.defined()) { // init nest - std::vector > init_nest(n.init_nest.begin(), - n.init_nest.begin() + tloc + 1); + std::vector> init_nest(n.init_nest.begin(), n.init_nest.begin() + tloc + 1); 
init_nest.emplace_back(MakeIfNest(n.init_predicates)); Stmt init = MergeNest(output_bind_nest, intrin->reduce_init); init = te::Substitute(init, n.init_vmap); @@ -476,17 +475,17 @@ TVM_REGISTER_GLOBAL("test.op.InferTensorizeRegion").set_body([](TVMArgs args, TV Stage stage = args[0]; Map dmap = args[1]; std::unordered_map out_dom; - std::unordered_map > in_region; + std::unordered_map> in_region; ICHECK(stage->op.as()); InferTensorizeRegion(stage->op.as(), stage, as_unordered_map(dmap), &out_dom, &in_region); - *ret = Array{Map(out_dom), Map >(in_region)}; + *ret = Array{Map(out_dom), Map>(in_region)}; }); TVM_REGISTER_GLOBAL("test.op.MatchTensorizeBody").set_body([](TVMArgs args, TVMRetValue* ret) { Stage stage = args[0]; Map out_dom = args[1]; - Map > in_region = args[2]; + Map> in_region = args[2]; TensorIntrin intrin = args[3]; Map vrange; ICHECK(stage->op.as()); diff --git a/src/te/schedule/graph.h b/src/te/schedule/graph.h index bb98ff4b706d..d31473d1b5a0 100644 --- a/src/te/schedule/graph.h +++ b/src/te/schedule/graph.h @@ -38,17 +38,17 @@ namespace te { /*! * \brief data structure of Operation->Tensors it reads */ -using ReadGraph = Map >; +using ReadGraph = Map>; /*! * \brief AttachPath maps op-> a list of IterVar */ -using AttachPath = Map >; +using AttachPath = Map>; /*! * \brief The map between tensor and operation it feeds to. */ -using FeedGraph = std::unordered_map >; +using FeedGraph = std::unordered_map>; /*! 
* \brief Get read graph of each operation to all the diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index a8363fd084cd..39243bf2216f 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -507,7 +507,7 @@ void RebaseNonZeroMinLoop(ScheduleNode* sch) { void InjectInline(ScheduleNode* sch, bool feature_extraction_mode) { sch->InvalidateCache(); - std::vector > new_body(sch->stages.size()); + std::vector> new_body(sch->stages.size()); std::vector changed(sch->stages.size(), false); std::vector new_hybrid_body(sch->stages.size()); std::vector hybrid_changed(sch->stages.size(), false); diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 1ac0f1f1705e..cae4109a6026 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -152,7 +152,7 @@ inline std::pair MergeMulModInner(arith::Analyzer* analyzer, // Otherwise, the elements will be added to the no_opt_sum variable inline void MergeMulModInsertElements(const std::vector& eles, std::list* mult_exprs, - std::list >* mod_exprs, + std::list>* mod_exprs, PrimExpr* no_opt_sum, bool* has_mult, bool* has_mod) { using namespace tir; *has_mult = false; @@ -194,13 +194,13 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { simplified_base = analyzer->Simplify(simplified_base); std::vector eles = ExprSplitAddition(simplified_base); std::list mult_exprs; - std::list > mod_exprs; + std::list> mod_exprs; PrimExpr no_opt_sum; bool has_mult; bool has_mod; MergeMulModInsertElements(eles, &mult_exprs, &mod_exprs, &no_opt_sum, &has_mult, &has_mod); bool find_opt = false; - std::list >::iterator search_mod_it = mod_exprs.begin(); + std::list>::iterator search_mod_it = mod_exprs.begin(); // 2. 
Exhaustive Search while (search_mod_it != mod_exprs.end()) { std::list::iterator mult_it = mult_exprs.begin(); @@ -238,7 +238,7 @@ inline PrimExpr MergeMulMod(arith::Analyzer* analyzer, const PrimExpr& base) { for (std::list::iterator it = mult_exprs.begin(); it != mult_exprs.end(); ++it) { no_opt_sum = no_opt_sum.get() ? no_opt_sum + *it : *it; } - for (std::list >::iterator it = mod_exprs.begin(); + for (std::list>::iterator it = mod_exprs.begin(); it != mod_exprs.end(); ++it) { no_opt_sum = no_opt_sum.get() ? no_opt_sum + indexmod(it->first, it->second) : indexmod(it->first, it->second); diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc index f3a9f990599f..1b1cabeadb71 100644 --- a/src/tir/transforms/coproc_sync.cc +++ b/src/tir/transforms/coproc_sync.cc @@ -111,7 +111,7 @@ class CoProcSyncPlanner : public StorageAccessVisitor { } // Write synchronization to be inserted before or after stmt. - std::unordered_map > sync_; + std::unordered_map> sync_; protected: bool Enabled(const VarNode* buf, const StorageScope& scope) const final { @@ -230,8 +230,8 @@ class CoProcBarrierDetector : public StorageAccessVisitor { PlanWriteBarrier(scope_.back(), nullptr); } - std::unordered_map > barrier_before_; - std::unordered_map > barrier_after_; + std::unordered_map> barrier_before_; + std::unordered_map> barrier_after_; protected: bool Enabled(const VarNode* buf, const StorageScope& scope) const final { @@ -251,7 +251,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor { // Plan write barrier at Read after write point. 
std::vector PlanWriteBarrier(std::vector seq, const ForNode* loop) { std::vector read_seq; - std::unordered_map > write_set; + std::unordered_map> write_set; auto fupdate = [&](size_t i, const AccessEntry& acc) { auto it = write_set.find(acc.buffer.get()); @@ -289,7 +289,7 @@ class CoProcBarrierDetector : public StorageAccessVisitor { std::vector PlanReadBarrier(std::vector seq, const ForNode* loop) { std::vector write_seq; - std::unordered_map > read_set; + std::unordered_map> read_set; auto fupdate = [&](size_t i, const AccessEntry& acc) { auto it = read_set.find(acc.buffer.get()); @@ -443,8 +443,8 @@ class CoProcInstDepDetector : public StmtVisitor { // insert before is stored in reverse order // the first element is closest to the node. - std::unordered_map > insert_before_; - std::unordered_map > insert_after_; + std::unordered_map> insert_before_; + std::unordered_map> insert_after_; private: // state in the sync entry @@ -456,9 +456,9 @@ class CoProcInstDepDetector : public StmtVisitor { // Set of all possible contexts in the exit moment. std::unordered_set exit_ctx; // existing pop performed at enter - std::vector > enter_pop; + std::vector> enter_pop; // existing push performed at exit - std::vector > exit_push; + std::vector> exit_push; // clear the state void clear() { node = nullptr; @@ -473,8 +473,8 @@ class CoProcInstDepDetector : public StmtVisitor { // return the push/pop message at enter/exit of the Block // after considering the existing unmatcheded events and added events void InjectSync(const SyncState& prev, const SyncState& next, - std::vector >* prev_exit_push, - std::vector >* next_enter_pop) { + std::vector>* prev_exit_push, + std::vector>* next_enter_pop) { prev_exit_push->clear(); next_enter_pop->clear(); // quick path @@ -491,9 +491,9 @@ class CoProcInstDepDetector : public StmtVisitor { return; } // complicate path. 
- std::vector > vpush = prev.exit_push; - std::vector > vpop = next.enter_pop; - std::vector > pending; + std::vector> vpush = prev.exit_push; + std::vector> vpop = next.enter_pop; + std::vector> pending; for (int from : prev.exit_ctx) { for (int to : next.enter_ctx) { if (from != to) { @@ -556,7 +556,7 @@ class CoProcInstDepDetector : public StmtVisitor { void UpdateState() { if (last_state_.node != nullptr) { - std::vector > t1, t2; + std::vector> t1, t2; InjectSync(last_state_, curr_state_, &t1, &t2); std::swap(last_state_, curr_state_); } else { @@ -642,8 +642,8 @@ class CoProcSyncInserter : public StmtMutator { private: // insert before is stored in reverse order // the first element is closest to the node. - std::unordered_map > insert_before_; - std::unordered_map > insert_after_; + std::unordered_map> insert_before_; + std::unordered_map> insert_after_; }; Stmt CoProcSync(Stmt stmt) { return CoProcSyncInserter().Insert(std::move(stmt)); } diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc index 03f2ccd40dd1..d974e3c8108a 100644 --- a/src/tir/transforms/inject_double_buffer.cc +++ b/src/tir/transforms/inject_double_buffer.cc @@ -299,9 +299,9 @@ class DoubleBufferInjector : public StmtExprMutator { // The current loop next std::vector loop_nest_; // The allocs to be appended before the loop - std::unordered_map > loop_allocs_; + std::unordered_map> loop_allocs_; // The stmt to be appended before the loop - std::unordered_map > loop_pre_; + std::unordered_map> loop_pre_; // The allocation size of the buffer std::unordered_map dbuffer_info_; // The updated Buffer objects diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc index 83722d7b8aab..455140c75c13 100644 --- a/src/tir/transforms/inject_virtual_thread.cc +++ b/src/tir/transforms/inject_virtual_thread.cc @@ -177,7 +177,7 @@ class VarTouchedAnalysis : public StmtVisitor { // Whether variable is touched by 
the thread variable. std::unordered_set touched_var_; // x -> all the buffers x read from - std::unordered_map > affect_; + std::unordered_map> affect_; }; // Inject virtual thread loop diff --git a/src/tir/transforms/ir_utils.h b/src/tir/transforms/ir_utils.h index a54eebe4ed05..6915a0e3acc9 100644 --- a/src/tir/transforms/ir_utils.h +++ b/src/tir/transforms/ir_utils.h @@ -54,7 +54,7 @@ Stmt MergeNest(const std::vector& nest, Stmt body); * \param body body * \return The combined Stmt */ -Stmt MergeNest(const std::vector >& nest, Stmt body); +Stmt MergeNest(const std::vector>& nest, Stmt body); /*! * \brief update array with an unary function diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 35c96e4fe4e1..4f8ad1223cd2 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -204,8 +204,8 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { } // Need to re-declare vars, in case some arguments also appears in the buffer. 
- std::vector > var_def; - std::vector > buffer_def; + std::vector> var_def; + std::vector> buffer_def; for (int i = 0; i < static_cast(func_ptr->params.size()); ++i) { Var param = func_ptr->params[i]; @@ -343,7 +343,7 @@ Pass MakePackedAPI(int num_unpacked_args) { // packed arguments anyway while `num_unpacked_args` is -1 auto pass_func = [num_unpacked_args](IRModule m, PassContext ctx) { IRModuleNode* mptr = m.CopyOnWrite(); - std::vector > updates; + std::vector> updates; for (const auto& kv : mptr->functions) { if (auto* n = kv.second.as()) { diff --git a/src/tir/transforms/storage_access.h b/src/tir/transforms/storage_access.h index a48ee73f17fc..ac64e2f5cb65 100644 --- a/src/tir/transforms/storage_access.h +++ b/src/tir/transforms/storage_access.h @@ -125,7 +125,7 @@ class StorageAccessVisitor : public StmtExprVisitor { */ StorageScope GetScope(Var buffer_var) const; // access scope - std::vector > scope_; + std::vector> scope_; private: // whether access appending is enabled. diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index acb052650036..177017f9a245 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -1010,11 +1010,11 @@ class StoragePlanRewriter : public StmtExprMutator { // symbolic free list, for non constant items. std::list sym_free_list_; // The allocation attach map - std::unordered_map > attach_map_; + std::unordered_map> attach_map_; // The allocation assign map std::unordered_map alloc_map_; // The allocations - std::vector > alloc_vec_; + std::vector> alloc_vec_; // The buffer objects being remapped std::unordered_map buffer_remap_; // analyzer From 7f1856d34f03113dc3a7733c010be43446161944 Mon Sep 17 00:00:00 2001 From: Adam Straw Date: Fri, 26 Aug 2022 14:22:04 -0700 Subject: [PATCH 060/704] [Hexagon] Asynchronous DMA support (#12411) Adds asynchronous DMA support through the Hexagon User DMA engine with unit tests to validate basic functionality.
Asynchronous DMA support here means the ability to "kick off" asynchronously a number of DMAs using the Copy API and then to Poll for or Wait on a number of "in flight" (not done) DMAs. Enables future testing and development for asynchronous memory copy on Hexagon. For now, Hexagon DMA support remains synchronous in nature through existing hexagon_user_dma_1d_sync interface which uses asynchronous capable HexagonUserDMA class in a synchronous way --- calling Copy and Wait back to back for each request. * use ring buffer to store DMA descriptors * add RingBuffer class; used by HexUserDMA to store descriptors * add test to overflow the HexagonUserDMA ring buffer --- src/runtime/hexagon/hexagon_device_api.cc | 3 +- src/runtime/hexagon/hexagon_user_dma.cc | 112 ++++++----- src/runtime/hexagon/hexagon_user_dma.h | 97 +++++++++ .../hexagon/hexagon_user_dma_descriptors.h | 2 - .../hexagon/hexagon_user_dma_instructions.h | 8 +- src/runtime/hexagon/ring_buffer.h | 94 +++++++++ .../hexagon/hexagon_user_dma_tests.cc | 178 ++++++++++++++++ .../cpp-runtime/hexagon/ring_buffer_tests.cc | 190 ++++++++++++++++++ 8 files changed, 631 insertions(+), 53 deletions(-) create mode 100644 src/runtime/hexagon/hexagon_user_dma.h create mode 100644 src/runtime/hexagon/ring_buffer.h create mode 100644 tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc create mode 100644 tests/cpp-runtime/hexagon/ring_buffer_tests.cc diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 92a7b22784fb..f22afca10bfa 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -170,7 +170,8 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM void* src = args[1]; int size = args[2]; - hexagon_user_dma_1d_sync(dst, src, size); + int error_code = hexagon_user_dma_1d_sync(dst, src, size); + CHECK_EQ(error_code, 0); *rv = static_cast(0); }); diff --git 
a/src/runtime/hexagon/hexagon_user_dma.cc b/src/runtime/hexagon/hexagon_user_dma.cc index 0e3fbd2048f6..8d45b7590bc4 100644 --- a/src/runtime/hexagon/hexagon_user_dma.cc +++ b/src/runtime/hexagon/hexagon_user_dma.cc @@ -17,66 +17,47 @@ * under the License. */ -#include +#include "hexagon_user_dma.h" -#include "hexagon_common.h" -#include "hexagon_user_dma_descriptors.h" -#include "hexagon_user_dma_instructions.h" -#include "hexagon_user_dma_registers.h" +#include namespace tvm { namespace runtime { namespace hexagon { -int init_hexagon_user_dma() { -#if __HEXAGON_ARCH__ >= 68 - // reset DMA engine +unsigned int HexagonUserDMA::Init() { unsigned int status = dmpause() & DM0_STATUS_MASK; - if (status != DM0_STATUS_IDLE) { - return DMA_FAILURE; - } -#endif - return DMA_SUCCESS; + return status; } -int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) { -#if __HEXAGON_ARCH__ >= 68 - static int config_dma = init_hexagon_user_dma(); - if (config_dma != DMA_SUCCESS) { +int HexagonUserDMA::Copy(void* dst, void* src, uint32_t length) { + // length limited to 24 bits + if (length > DESC_LENGTH_MASK) { return DMA_FAILURE; } - uint64_t src64 = reinterpret_cast(src); // source address limited to 32 bits - if (src64 > DESC_SRC_MASK) { + uint64_t src64 = reinterpret_cast(src); + if (!src64 || src64 > DESC_SRC_MASK) { return DMA_FAILURE; } - uint64_t dst64 = reinterpret_cast(dst); // destination address limited to 32 bits - if (dst64 > DESC_DST_MASK) { - return DMA_FAILURE; - } - - // length limited to 24 bits - if (length > DESC_LENGTH_MASK) { + uint64_t dst64 = reinterpret_cast(dst); + if (!dst64 || dst64 > DESC_DST_MASK) { return DMA_FAILURE; } - uint32_t src32 = src64 & DESC_SRC_MASK; - uint32_t dst32 = dst64 & DESC_DST_MASK; - - void* dma_desc = nullptr; - - int ret = posix_memalign(&dma_desc, DMA_DESC_2D_SIZE, DMA_DESC_2D_SIZE); - if (ret) { - return DMA_FAILURE; - } + uint32_t src32 = static_cast(src64); + uint32_t dst32 = static_cast(dst64); + // get 
pointer to next descriptor + dma_desc_2d_t* dma_desc = descriptors_->Next(); if (!dma_desc) { - return DMA_FAILURE; + return DMA_RETRY; } + // populate descriptor fields dma_desc_set_state(dma_desc, DESC_STATE_READY); dma_desc_set_next(dma_desc, DMA_NULL_PTR); dma_desc_set_length(dma_desc, length); @@ -90,23 +71,60 @@ int hexagon_user_dma_1d_sync_helper(void* dst, void* src, uint32_t length) { dma_desc_set_src(dma_desc, src32); dma_desc_set_dst(dma_desc, dst32); - dmstart(dma_desc); - unsigned int status = dmwait() & DM0_STATUS_MASK; - unsigned int done = dma_desc_get_done(dma_desc); + if (first_dma_) { + // `dmstart` first descriptor + dmstart(dma_desc); + first_dma_ = false; + } else { + // `dmlink` descriptor to tail descriptor + dmlink(tail_dma_desc_, dma_desc); + } - free(dma_desc); + // update tail + tail_dma_desc_ = dma_desc; + return DMA_SUCCESS; +} - if (status == DM0_STATUS_IDLE && done == DESC_DONE_COMPLETE) { - return DMA_SUCCESS; +void HexagonUserDMA::Wait(uint32_t max_dmas_in_flight) { + // wait (forever) until max DMAs in flight <= actual DMAs in flight + while (DMAsInFlight() > max_dmas_in_flight) { } -#endif - return DMA_FAILURE; +} + +uint32_t HexagonUserDMA::Poll() { return DMAsInFlight(); } + +uint32_t HexagonUserDMA::DMAsInFlight() { + dmpoll(); // update DMA engine status + return descriptors_->InFlight(); +} + +HexagonUserDMA::HexagonUserDMA() { + // reset DMA engine + unsigned int status = Init(); + CHECK_EQ(status, DM0_STATUS_IDLE); + + auto desc_in_flight = [](dma_desc_2d_t* dma_desc) { + unsigned int done = dma_desc_get_done(dma_desc); + return (done != DESC_DONE_COMPLETE); + }; + descriptors_ = new RingBuffer(MAX_DMA_DESCRIPTORS, desc_in_flight); +} + +HexagonUserDMA::~HexagonUserDMA() { + Init(); // stop DMA engine + delete descriptors_; } int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { // One DMA transfer can copy at most DESC_LENGTH_MASK bytes. // Make the common case quick. 
- if (length <= DESC_LENGTH_MASK) return hexagon_user_dma_1d_sync_helper(dst, src, length); + if (length <= DESC_LENGTH_MASK) { + // sync DMA -> `Copy` and then `Wait(0)` + int ret_val = HexagonUserDMA::Get().Copy(dst, src, length); + if (ret_val != DMA_SUCCESS) return ret_val; + HexagonUserDMA::Get().Wait(0); + return DMA_SUCCESS; + } // Split big transfers into smaller transfers. char* cast_src = static_cast(src); @@ -114,8 +132,10 @@ int hexagon_user_dma_1d_sync(void* dst, void* src, uint32_t length) { for (uint32_t i = 0; i < length;) { // Ensure there is no overflow while updating i uint32_t cur_len = std::min(length - i, DESC_LENGTH_MASK); - int ret_val = hexagon_user_dma_1d_sync_helper(&cast_dst[i], &cast_src[i], cur_len); + // sync DMA -> `Copy` and then `Wait(0)` + int ret_val = HexagonUserDMA::Get().Copy(&cast_dst[i], &cast_src[i], cur_len); if (ret_val != DMA_SUCCESS) return ret_val; + HexagonUserDMA::Get().Wait(0); // 2 cases for new val for i: // 1. length - i <= DESC_LENGTH_MASK (<= MAX_UINT) // new_i = i + (length - i) = length, no more iter diff --git a/src/runtime/hexagon/hexagon_user_dma.h b/src/runtime/hexagon/hexagon_user_dma.h new file mode 100644 index 000000000000..aa00df79c4d0 --- /dev/null +++ b/src/runtime/hexagon/hexagon_user_dma.h @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ +#define TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ + +#include "hexagon_common.h" +#include "hexagon_user_dma_descriptors.h" +#include "hexagon_user_dma_instructions.h" +#include "hexagon_user_dma_registers.h" +#include "ring_buffer.h" + +namespace tvm { +namespace runtime { +namespace hexagon { + +#define DMA_SUCCESS 0 +#define DMA_FAILURE -1 +#define DMA_RETRY 1 +#define MAX_DMA_DESCRIPTORS 100 + +class HexagonUserDMA { + public: + /*! + * \brief Initiate DMA to copy memory from source to destination address + * \param dst Destination address + * \param src Source address + * \param length Length in bytes to copy + * \returns Status: DMA_SUCCESS or DMA_FAILURE + */ + int Copy(void* dst, void* src, uint32_t length); + + /*! + * \brief Wait until the number of DMAs in flight is less than or equal to some maximum + * \param max_dmas_in_flight Maximum number of DMAs allowed to be in flight + * to satisfy the `Wait` e.g. use `Wait(0)` to wait on "all" outstanding DMAs to complete + */ + void Wait(uint32_t max_dmas_in_flight); + + /*! + * \brief Poll the number of DMAs in flight + * \returns Number of DMAs in flight + */ + uint32_t Poll(); + + //! 
\brief HexagonUserDMA uses the singleton pattern + static HexagonUserDMA& Get() { + static HexagonUserDMA* hud = new HexagonUserDMA(); + return *hud; + } + + private: + // HexagonUserDMA uses the singleton pattern + HexagonUserDMA(); + ~HexagonUserDMA(); + HexagonUserDMA(const HexagonUserDMA&) = delete; + HexagonUserDMA& operator=(const HexagonUserDMA&) = delete; + HexagonUserDMA(HexagonUserDMA&&) = delete; + HexagonUserDMA& operator=(HexagonUserDMA&&) = delete; + + //! \brief Initializes the Hexagon User DMA engine + unsigned int Init(); + + //! \brief Calculates and returns the number of DMAs in flight + uint32_t DMAsInFlight(); + + //! \brief Tracks whether the very first DMA has been executed + bool first_dma_{true}; + + //! \brief Tracks the tail DMA descriptor + void* tail_dma_desc_{nullptr}; + + //! \brief Storage for all DMA descriptors + RingBuffer* descriptors_{nullptr}; +}; + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_HEXAGON_HEXAGON_USER_DMA_H_ diff --git a/src/runtime/hexagon/hexagon_user_dma_descriptors.h b/src/runtime/hexagon/hexagon_user_dma_descriptors.h index 643dbc5e8bf5..913b025df138 100644 --- a/src/runtime/hexagon/hexagon_user_dma_descriptors.h +++ b/src/runtime/hexagon/hexagon_user_dma_descriptors.h @@ -126,8 +126,6 @@ namespace hexagon { #define DESC_DSTWIDTHOFFSET_MASK 0xFFFF0000 #define DESC_DSTWIDTHOFFSET_SHIFT 16 -#define DMA_SUCCESS 0 -#define DMA_FAILURE -1 #define DMA_NULL_PTR 0 /**************************/ diff --git a/src/runtime/hexagon/hexagon_user_dma_instructions.h b/src/runtime/hexagon/hexagon_user_dma_instructions.h index e160b7395658..2345d4daaf21 100644 --- a/src/runtime/hexagon/hexagon_user_dma_instructions.h +++ b/src/runtime/hexagon/hexagon_user_dma_instructions.h @@ -24,8 +24,6 @@ namespace tvm { namespace runtime { namespace hexagon { -#if __HEXAGON_ARCH__ >= 68 - inline unsigned int dmpause() { unsigned int dm0 = 0; asm volatile(" %0 = dmpause" : "=r"(dm0)); @@ -34,6 
+32,10 @@ inline unsigned int dmpause() { inline void dmstart(void* next) { asm volatile(" dmstart(%0)" : : "r"(next)); } +inline void dmlink(void* tail, void* next) { + asm volatile(" dmlink(%0, %1)" : : "r"(tail), "r"(next)); +} + inline unsigned int dmpoll() { unsigned int dm0 = 0; asm volatile(" %0 = dmpoll" : "=r"(dm0)); @@ -70,8 +72,6 @@ inline void dmcfgwr(unsigned int dmindex, unsigned int data) { asm volatile(" dmcfgwr(%0, %1)" : : "r"(dmindex), "r"(data)); } -#endif - } // namespace hexagon } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/ring_buffer.h b/src/runtime/hexagon/ring_buffer.h new file mode 100644 index 000000000000..d21b2b9953c2 --- /dev/null +++ b/src/runtime/hexagon/ring_buffer.h @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ +#define TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ + +#include + +#include "hexagon_common.h" + +namespace tvm { +namespace runtime { +namespace hexagon { + +template +class RingBuffer { + public: + //! 
\brief Returns the number of Ts in flight + uint32_t InFlight() { + while (id_oldest_ < id_next_ && !in_flight_(GetAddr(id_oldest_))) { + id_oldest_++; + } + return id_next_ - id_oldest_; + } + + //! \brief Returns pointer to next T; null if ring buffer is full + T* Next() { + if (InFlight() == ring_buff_size_) { + return nullptr; + } + T* next = GetAddr(id_next_); + id_next_++; + return next; + } + + /*! \brief Creates a ring buffer for storage items of type T + * \param ring_buff_size Size of the ring buffer in number of Ts + * \param in_flight Function that determines whether a T is in flight + */ + RingBuffer(uint32_t ring_buff_size, std::function in_flight) + : ring_buff_size_(ring_buff_size), in_flight_(in_flight) { + CHECK_NE(ring_buff_size, 0); + int ret = posix_memalign(reinterpret_cast(&ring_buff_ptr_), sizeof(T), + sizeof(T) * ring_buff_size_); + CHECK_EQ(ret, 0); + CHECK_NE(ring_buff_ptr_, nullptr); + } + + ~RingBuffer() { free(ring_buff_ptr_); } + + private: + //! \brief Returns the address of a T given its index + T* GetAddr(uint32_t id) const { + uint32_t ring_buff_index = id % ring_buff_size_; + return ring_buff_ptr_ + ring_buff_index; + } + + //! \brief Pointer to the ring buffer + T* ring_buff_ptr_{nullptr}; + + //! \brief Size of the ring buffer in number of Ts + const uint32_t ring_buff_size_; + + //! \brief Function that determines whether a T is in flight + const std::function in_flight_; + + //! \brief Tracks the ID of the next T to be added to the ring buffer + uint32_t id_next_{0}; + + //! 
\brief Tracks the ID of the oldest T in flight + uint32_t id_oldest_{0}; +}; + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_HEXAGON_RING_BUFFER_H_ diff --git a/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc new file mode 100644 index 000000000000..bf7a23712d7d --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_user_dma_tests.cc @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include + +#include "../src/runtime/hexagon/hexagon_user_dma.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::hexagon; + +class HexagonUserDMATest : public ::testing::Test { + void SetUp() override { + src = malloc(length); + dst = malloc(length); + ASSERT_NE(src, nullptr); + ASSERT_NE(dst, nullptr); + + src_char = static_cast(src); + dst_char = static_cast(dst); + for (uint32_t i = 0; i < length; ++i) { + src_char[i] = 1; + dst_char[i] = 0; + } + } + void TearDown() override { + free(src); + free(dst); + } + + public: + int ret{0}; + void* src{nullptr}; + void* dst{nullptr}; + char* src_char{nullptr}; + char* dst_char{nullptr}; + uint32_t length{0x4000}; // 16KB +}; + +TEST_F(HexagonUserDMATest, wait) { + HexagonUserDMA::Get().Wait(0); + HexagonUserDMA::Get().Wait(10); +} + +TEST_F(HexagonUserDMATest, poll) { ASSERT_EQ(HexagonUserDMA::Get().Poll(), 0); } + +TEST_F(HexagonUserDMATest, bad_copy) { + uint64_t bigaddr = 0x100000000; + void* src64 = reinterpret_cast(bigaddr); + void* dst64 = reinterpret_cast(bigaddr); + uint32_t biglength = 0x1000000; + ASSERT_NE(HexagonUserDMA::Get().Copy(dst64, src, length), DMA_SUCCESS); + ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src64, length), DMA_SUCCESS); + ASSERT_NE(HexagonUserDMA::Get().Copy(dst, src, biglength), DMA_SUCCESS); +} + +TEST_F(HexagonUserDMATest, sync_dma) { + // kick off 1 DMA + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + + // wait for DMA to complete + HexagonUserDMA::Get().Wait(0); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, async_dma_wait) { + // kick off 10x duplicate DMAs + for (uint32_t i = 0; i < 10; ++i) { + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // wait for at least 1 DMA to complete + HexagonUserDMA::Get().Wait(9); + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], 
dst_char[i]); + } + + // empty the DMA queue + HexagonUserDMA::Get().Wait(0); +} + +TEST_F(HexagonUserDMATest, async_dma_poll) { + // kick off 10x duplicate DMAs + for (uint32_t i = 0; i < 10; ++i) { + ret = HexagonUserDMA::Get().Copy(dst, src, length); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // poll until at least 1 DMA is complete + while (HexagonUserDMA::Get().Poll() == 10) { + }; + + // verify + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } + + // empty the DMA queue + HexagonUserDMA::Get().Wait(0); +} + +// TODO: Run non-pipelined case with sync DMA and execution time vs. pipelined case +TEST_F(HexagonUserDMATest, pipeline) { + uint32_t pipeline_depth = 4; + uint32_t pipeline_length = length / pipeline_depth; + + for (uint32_t i = 0; i < pipeline_depth; ++i) { + ret |= HexagonUserDMA::Get().Copy(dst_char + i * pipeline_length, + src_char + i * pipeline_length, pipeline_length); + } + + HexagonUserDMA::Get().Wait(3); + for (uint32_t i = 0; i < pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(2); + for (uint32_t i = pipeline_length; i < 2 * pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(1); + for (uint32_t i = 2 * pipeline_length; i < 3 * pipeline_length; ++i) { + dst_char[i]++; + } + + HexagonUserDMA::Get().Wait(0); + for (uint32_t i = 3 * pipeline_length; i < 4 * pipeline_length; ++i) { + dst_char[i]++; + } + + // verify + ASSERT_EQ(ret, DMA_SUCCESS); + for (uint32_t i = 0; i < length; ++i) { + ASSERT_EQ(2, dst_char[i]); + } +} + +TEST_F(HexagonUserDMATest, overflow_ring_buffer) { + uint32_t number_of_dmas = 0x400; // 1k + uint32_t length_of_each_dma = length / number_of_dmas; + + for (uint32_t i = 0; i < number_of_dmas; ++i) { + do { + ret = HexagonUserDMA::Get().Copy(dst_char + i * length_of_each_dma, + src_char + i * length_of_each_dma, length_of_each_dma); + } while (ret == DMA_RETRY); + ASSERT_EQ(ret, DMA_SUCCESS); + } + + // verify + for (uint32_t i = 0; i < 
length; ++i) { + ASSERT_EQ(src_char[i], dst_char[i]); + } +} \ No newline at end of file diff --git a/tests/cpp-runtime/hexagon/ring_buffer_tests.cc b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc new file mode 100644 index 000000000000..cd40dca87b02 --- /dev/null +++ b/tests/cpp-runtime/hexagon/ring_buffer_tests.cc @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include + +#include "../src/runtime/hexagon/ring_buffer.h" + +using namespace tvm::runtime; +using namespace tvm::runtime::hexagon; + +class RingBufferTest : public ::testing::Test { + void SetUp() override { ring_buff = new RingBuffer(size, in_flight); } + void TearDown() override { delete ring_buff; } + + public: + std::function in_flight = [](int* ptr) { + if (*ptr == 42) { + // finished + return false; + } + // in flight + return true; + }; + + int finished = 42; + int inflight = 43; + uint32_t size = 4; + uint32_t half = size / 2; + RingBuffer* ring_buff; +}; + +TEST_F(RingBufferTest, zero_size_ring_buffer) { + ASSERT_THROW(RingBuffer(0, in_flight), InternalError); +} + +TEST_F(RingBufferTest, in_flight) { ASSERT_EQ(ring_buff->InFlight(), 0); } + +TEST_F(RingBufferTest, next) { + // get pointer to first item + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // mark it in flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 1); + + // mark it finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); +} + +TEST_F(RingBufferTest, full) { + // fill the ring buffer + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // mark in flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); + } + + // check that the ring buffer is full + ASSERT_EQ(ring_buff->Next(), nullptr); + ASSERT_EQ(ring_buff->InFlight(), size); +} + +TEST_F(RingBufferTest, wrap) { + // fill the ring buffer, but mark each finished + bool first = true; + int* firstptr = nullptr; + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // save first ptr for later comparison + if (first) { + firstptr = ptr; + first = false; + } + + // mark finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); + } + + // reuse the first ring buffer entry + int* ptr = ring_buff->Next(); + ASSERT_EQ(ptr, firstptr); + + // mark it in 
flight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 1); + + // mark it finished and check + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); +} + +TEST_F(RingBufferTest, wrap_corner) { + for (int i = 0; i < size; ++i) { + int* ptr = ring_buff->Next(); + *ptr = finished; + } + + // reuse the first ring buffer entry + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // user must mark the item "inflight" before checking in flight count + // here the "finished" status is inherited from the reused ring buffer entry + // thus the in flight count is zero instead one; which the user might expect + ASSERT_EQ(ring_buff->InFlight(), 0); + + // marking the item "inflight" after checking the in flight count + // will not change the outcome; the ring buffer considers the item "finished" + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 0); +} + +TEST_F(RingBufferTest, half_in_flight) { + // these will complete + for (int i = 0; i < half; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 0); + } + + // these will not complete + for (int i = 0; i < half; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); + } + + // check half in flight + ASSERT_EQ(ring_buff->InFlight(), half); + + // get pointer to next item + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + + // mark it inflight and check + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), 3); + + // mark it finished and check also blocked + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), 3); +} + +TEST_F(RingBufferTest, half_in_flight_blocked) { + // these will not complete + for (int i = 0; i < half; ++i) { + int* ptr = ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = inflight; + ASSERT_EQ(ring_buff->InFlight(), i + 1); + } + + // these would complete, but they are blocked + for (int i = half; i < size; ++i) { + int* ptr 
= ring_buff->Next(); + ASSERT_NE(ptr, nullptr); + *ptr = finished; + ASSERT_EQ(ring_buff->InFlight(), i + 1); + } + + // check that the ring buffer is full + ASSERT_EQ(ring_buff->Next(), nullptr); + ASSERT_EQ(ring_buff->InFlight(), size); +} From 370abe69d24519a5453cead846d328a1c378957f Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 26 Aug 2022 20:20:42 -0700 Subject: [PATCH 061/704] [MetaSchedule][UX] Make `Database` with-able (#12520) `ApplyHistoryBest` right now plays a role as the database adaptor to query inside the database. In fact, the logic could be simplified and users only have to deal with `Database` instead of this extra object. - [x] Add `EnterWithScope`/`ExitWithScope`/`Current` to Database - [x] Migrate `te_filter_func` => "tir_filter" in Relay's pass context - [x] Migrate `f_take_tuning_record` => "Database.query_tuning_record" - [x] Migrate `TECompiler` to use `Database` - [x] Remove apply-history-best Next PR: - Migrate `f_direct_dispatch` (potentially unify with `apply_fixed_schedule`?) 
--- .../tvm/meta_schedule/apply_history_best.h | 115 ------------ include/tvm/meta_schedule/database.h | 28 +++ include/tvm/meta_schedule/extracted_task.h | 20 --- .../tvm/auto_scheduler/testing/tune_relay.py | 93 +++++----- python/tvm/meta_schedule/__init__.py | 1 - .../tvm/meta_schedule/apply_history_best.py | 130 -------------- python/tvm/meta_schedule/database/database.py | 104 ++++++++++- python/tvm/meta_schedule/default_config.py | 4 - python/tvm/meta_schedule/relay_integration.py | 29 ++- .../tvm/meta_schedule/testing/tune_relay.py | 30 +++- python/tvm/meta_schedule/testing/utils.py | 26 +-- python/tvm/meta_schedule/tune.py | 12 +- src/meta_schedule/apply_history_best.cc | 165 ------------------ src/meta_schedule/database/database.cc | 64 +++++++ src/meta_schedule/extracted_task.cc | 70 -------- src/meta_schedule/utils.h | 1 - src/relay/backend/task_extraction.cc | 25 +-- src/relay/backend/te_compiler.cc | 1 + src/relay/backend/te_compiler_cache.cc | 70 ++++---- src/relay/backend/utils.cc | 73 ++++++++ src/relay/backend/utils.h | 31 ++++ .../test_meta_schedule_auto_tensorize.py | 25 ++- tests/python/unittest/test_link_params.py | 19 +- .../test_meta_schedule_integration.py | 62 +------ .../test_meta_schedule_multi_anchor.py | 2 +- .../test_meta_schedule_relay_tir_compute.py | 18 +- .../unittest/test_meta_schedule_tune_relay.py | 57 +++--- 27 files changed, 511 insertions(+), 764 deletions(-) delete mode 100644 include/tvm/meta_schedule/apply_history_best.h delete mode 100644 python/tvm/meta_schedule/apply_history_best.py delete mode 100644 src/meta_schedule/apply_history_best.cc diff --git a/include/tvm/meta_schedule/apply_history_best.h b/include/tvm/meta_schedule/apply_history_best.h deleted file mode 100644 index 44a34b3ee496..000000000000 --- a/include/tvm/meta_schedule/apply_history_best.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#ifndef TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_ -#define TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace tvm { -namespace meta_schedule { - -/*! - * \brief An integration context that allows application of historically best records from a - * database - */ -class ApplyHistoryBestNode : public runtime::Object { - public: - /*! \brief A callback function that filters TE compute */ - using FTEFilterFunc = runtime::TypedPackedFunc( - const Array&, const Array&)>; - /*! \brief A callback function that takes a tuning record and does something with it */ - using FTakeTuningRecord = runtime::TypedPackedFunc; - using FDirectDispatch = runtime::TypedPackedFunc(const IRModule&)>; - - /*! \brief The database to be queried from */ - Database database{nullptr}; - /*! \brief The filtering function for TE computation */ - FTEFilterFunc te_filter_func{nullptr}; - /*! \brief The logging function to be used */ - PackedFunc logging_func; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("database", &database); - // `te_filter_func` is not visited - // `logging_func` is not visited - } - /*! 
- * \brief Query the best entry from the database - * \param task_name The name of the task to be queried - * \param mod The module to be queried - * \param target The target to be queried - * \param dispatched The IRs after dispatch - * \param f_take_tuning_record A callback function that takes a tuning record and does something - * with it. - * \param f_direct_dispatch A function that directly dispatches an IRModule to the given workload - * as result if available, skipping the database query. - */ - Optional Query(runtime::String task_name, IRModule mod, Target target, - Optional> dispatched, - FTakeTuningRecord f_take_tuning_record, - FDirectDispatch f_direct_dispatch = nullptr); - - static constexpr const char* _type_key = "meta_schedule.ApplyHistoryBest"; - TVM_DECLARE_FINAL_OBJECT_INFO(ApplyHistoryBestNode, runtime::Object); -}; - -/*! - * \brief Managed reference to ApplyHistoryBestNode - * \sa ApplyHistoryBestNode - */ -class ApplyHistoryBest : public runtime::ObjectRef { - public: - /*! - * \brief Constructor - * \param database The database to be queried from - * \param te_filter_func The filtering function for TE computation - * \param logging_func The logging function to use - */ - explicit ApplyHistoryBest(Database database, ApplyHistoryBestNode::FTEFilterFunc te_filter_func, - PackedFunc logging_func); - /*! - * \brief The current ApplyHistoryBest in the context - * \return The ApplyHistoryBest in the current scope. - */ - static Optional Current(); - - TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ApplyHistoryBest, runtime::ObjectRef, - ApplyHistoryBestNode); - - protected: - friend class ApplyHistoryBestInternal; - /*! \brief Entering the scope of the context manager */ - void EnterWithScope(); - /*! 
\brief Exiting the scope of the context manager */ - void ExitWithScope(); -}; - -} // namespace meta_schedule -} // namespace tvm - -#endif // TVM_META_SCHEDULE_APPLY_HISTORY_BEST_H_ diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 1c260d9d748a..0e7f45d39332 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -203,6 +203,27 @@ class DatabaseNode : public runtime::Object { * \return The size of the database. */ virtual int64_t Size() = 0; + /*! + * \brief Query the best record of the given workload from the database. + * \param mod The IRModule to be searched for. + * \param target The target to be searched for. + * \return The best record of the given workload; NullOpt if not found. + */ + virtual Optional QueryTuningRecord(IRModule mod, Target target); + /*! + * \brief Query the best schedule of the given workload from the database. + * \param mod The IRModule to be searched for. + * \param target The target to be searched for. + * \return The schedule in the best schedule of the given workload; NullOpt if not found. + */ + virtual Optional QuerySchedule(IRModule mod, Target target); + /*! + * \brief Query the best IRModule of the given workload from the database. + * \param mod The IRModule to be searched for. + * \param target The target to be searched for. + * \return The IRModule in the best IRModule of the given workload; NullOpt if not found. + */ + virtual Optional QueryIRModule(IRModule mod, Target target); static constexpr const char* _type_key = "meta_schedule.Database"; TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object); @@ -339,6 +360,13 @@ class Database : public runtime::ObjectRef { PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, PyDatabaseNode::FSize f_size); + /*! \return The current Database in the scope. */ + static Optional Current(); + /*! 
\brief Entering the scope of the context manager */ + void EnterWithScope(); + /*! \brief Exiting the scope of the context manager */ + void ExitWithScope(); + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Database, runtime::ObjectRef, DatabaseNode); }; diff --git a/include/tvm/meta_schedule/extracted_task.h b/include/tvm/meta_schedule/extracted_task.h index bce40e6b95f0..239bf0dc5777 100644 --- a/include/tvm/meta_schedule/extracted_task.h +++ b/include/tvm/meta_schedule/extracted_task.h @@ -76,26 +76,6 @@ class ExtractedTask : public runtime::ObjectRef { ExtractedTaskNode); }; -/*! - * \brief The default TE task filter - * \param args The input/output arguments of the TE compute graph - * \param constants Raw data for constant tensors in args. If the size of this array is N, the last - * N tensors in args will be treated as constant tensors. - * \return NullOpt if the task is filtered out, otherwise the task in PrimFunc - */ -Optional DefaultTaskFilter(const Array& args, - const Array& constants); - -/*! - * \brief The default TE task filter, with `te.extern` allowed - * \param args The input/output arguments of the TE compute graph - * \param constants Raw data for constant tensors in args. If the size of this array is N, the last - * N tensors in args will be treated as constant tensors. - * \return NullOpt if the task is filtered out, otherwise the task in PrimFunc - */ -Optional DefaultTaskFilterAllowExtern(const Array& args, - const Array& constants); - } // namespace meta_schedule } // namespace tvm diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py index fe747af7972c..2d84389f9de1 100644 --- a/python/tvm/auto_scheduler/testing/tune_relay.py +++ b/python/tvm/auto_scheduler/testing/tune_relay.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-docstring -from distutils.util import strtobool import argparse import json import os +from distutils.util import strtobool import tvm from tvm import auto_scheduler @@ -26,7 +26,7 @@ from tvm import relay from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network -from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer +from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data from tvm.meta_schedule.utils import cpu_count from tvm.support import describe @@ -170,53 +170,62 @@ def main(): ARGS.input_shape, cache_dir=ARGS.cache_dir, ) - input_info = {input_name: input_shape} + input_info = [ + { + "name": input_name, + "shape": input_shape, + "dtype": input_dtype, + }, + ] input_data = { - item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info } - for input_name, input_shape in input_info.items(): - print(f" input_name : {input_name}") - print(f" input_shape: {input_shape}") - print(f" input_dtype: {input_dtype}") + for item in input_info: + print(f" input_name : {item['name']}") + print(f" input_shape: {item['shape']}") + print(f" input_dtype: {item['dtype']}") with ms.Profiler() as profiler: - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], - params, - target=ARGS.target, - hardware_params=hardware_params, - ) - for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): - print( - f"==== Task {idx}: {task.desc} " - f"(weight {task_weight} key: {task.workload_key}) =====" - ) - print(task.compute_dag) - - if ARGS.num_trials > 0: - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune( - auto_scheduler.TuningOptions( - num_measure_trials=ARGS.num_trials, - runner=runner, - measure_callbacks=[ - auto_scheduler.RecordToFile(log_file), - 
], - ), - adaptive_training=ARGS.adaptive_training, + with ms.Profiler.timeit("TaskExtraction"): + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], + params, + target=ARGS.target, + hardware_params=hardware_params, ) + for idx, (task, task_weight) in enumerate(zip(tasks, task_weights)): + print( + f"==== Task {idx}: {task.desc} " + f"(weight {task_weight} key: {task.workload_key}) =====" + ) + print(task.compute_dag) + + with ms.Profiler.timeit("Tuning"): + if ARGS.num_trials > 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tuner.tune( + auto_scheduler.TuningOptions( + num_measure_trials=ARGS.num_trials, + runner=runner, + measure_callbacks=[ + auto_scheduler.RecordToFile(log_file), + ], + ), + adaptive_training=ARGS.adaptive_training, + ) relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, - ): - lib = relay_build( - mod, - target=ARGS.target, - params=params, - ) + with ms.Profiler.timeit("PostTuningCompilation"): + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_auto_scheduler": True}, + ): + lib = relay_build( + mod, + target=ARGS.target, + params=params, + ) print("Tuning Time:") print(profiler.table()) diff --git a/python/tvm/meta_schedule/__init__.py b/python/tvm/meta_schedule/__init__.py index f60d0a5490f5..cf348d49f4e2 100644 --- a/python/tvm/meta_schedule/__init__.py +++ b/python/tvm/meta_schedule/__init__.py @@ -30,7 +30,6 @@ search_strategy, space_generator, ) -from .apply_history_best import ApplyHistoryBest from .extracted_task import ExtractedTask from .profiler import Profiler from .relay_integration import ( diff --git a/python/tvm/meta_schedule/apply_history_best.py b/python/tvm/meta_schedule/apply_history_best.py deleted file mode 100644 index 
a7b9b20bf244..000000000000 --- a/python/tvm/meta_schedule/apply_history_best.py +++ /dev/null @@ -1,130 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""A context manager that injects the best tuning record in the database into compilation""" -import logging -from typing import Callable, List, Optional, Union - -from tvm._ffi import get_global_func, register_object -from tvm.ir import IRModule -from tvm.runtime import Object -from tvm.target import Target -from tvm.te import Tensor -from tvm.tir import PrimFunc - -from . import _ffi_api -from .database import Database, TuningRecord -from .utils import make_logging_func - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - - -@register_object("meta_schedule.ApplyHistoryBest") -class ApplyHistoryBest(Object): - """An integration context that allows application of historically best records from a database - - Parameters - ---------- - database : Database - The database to be queried from - te_filter_func : Union[str, None, Callable[[List[Tensor], List[NDArray]], PrimFunc]] = None - The filtering function for TE computation - If it's a string, it's the name of the filtering function. 
Built in functions are - - "meta_schedule.DefaultTaskFilter" - - "meta_schedule.DefaultTaskFilterAllowExtern" - If it's None, it's the default filtering function - If it's a callable, it's the filtering function - """ - - database: Database - - def __init__( - self, - database: Database, - te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None, - ) -> None: - if isinstance(te_filter_func, str): - te_filter_func = get_global_func(te_filter_func) - self.__init_handle_by_constructor__( - _ffi_api.ApplyHistoryBest, # type: ignore # pylint: disable=no-member - database, - te_filter_func, - make_logging_func(logger), - ) - - def query( - self, - task_name: str, - mod: IRModule, - target: Target, - dispatched: Optional[List[IRModule]], - f_take_tuning_record: Optional[Callable[[TuningRecord], None]] = None, - f_direct_dispatch: Optional[Callable[[IRModule], Optional[IRModule]]] = None, - ) -> Union[IRModule, None]: - """The entry point of the integration - - Parameters - ---------- - task_name : str - The name of the task extracted - mod : IRModule - The high-level IR - target: Target - Target Info - dispatched : Optional[List[IRModule]] - A list of low-level IRs that the high-level IR could potentially dispatch to - f_take_tuning_record : Optional[Callable[[TuningRecord], None]] = None - A callback function that takes a tuning record and does something with it - f_direct_dispatch : Optional[Callable[[IRModule], Optional[IRModule]]] = None - A function that directly dispatches an IRModule to the given workload as result if - available, skipping the database query. - - Returns - ------- - result : IRModule or None - Currently we only have to return tir::PrimFunc, but we wrap it under IRModule for - more general future use. None is returned if there is no feedback hint. 
- """ - return _ffi_api.ApplyHistoryBestQuery( # type: ignore # pylint: disable=no-member - self, - task_name, - mod, - target, - dispatched, - f_take_tuning_record, - f_direct_dispatch, - ) - - @staticmethod - def current() -> Optional["ApplyHistoryBest"]: - """The context manager in the current scope - - Returns - ------- - ctx : Optional[ApplyHistoryBest] - The ApplyHistoryBest context manager in the current scope. - None if it's currently not under any ApplyHistoryBest context. - """ - return _ffi_api.ApplyHistoryBestCurrent() # type: ignore # pylint: disable=no-member - - def __enter__(self) -> "ApplyHistoryBest": - """Entering the scope of the context manager""" - _ffi_api.ApplyHistoryBestEnterScope(self) # type: ignore # pylint: disable=no-member - return self - - def __exit__(self, ptype, value, trace) -> None: - """Exiting the scope of the context manager""" - _ffi_api.ApplyHistoryBestExitScope(self) # type: ignore # pylint: disable=no-member diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py index 0c11f77591cc..68283b4554e5 100644 --- a/python/tvm/meta_schedule/database/database.py +++ b/python/tvm/meta_schedule/database/database.py @@ -15,13 +15,14 @@ # specific language governing permissions and limitations # under the License. """TuningRecord database""" -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List, Optional, Union from tvm._ffi import register_object from tvm.ir.module import IRModule from tvm.runtime import Object from tvm.target import Target -from tvm.tir.schedule import Trace +from tvm.tir.schedule import Schedule, Trace +from typing_extensions import Literal # pylint: disable=wrong-import-order from .. 
import _ffi_api from ..arg_info import ArgInfo @@ -234,6 +235,105 @@ def __len__(self) -> int: """ return _ffi_api.DatabaseSize(self) # type: ignore # pylint: disable=no-member + def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningRecord]: + """Query the best record of the given workload from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + + Returns + ------- + tuning_record : Optional[TuningRecord] + The best record of the given workload; None if not found. + """ + return _ffi_api.DatabaseQueryTuningRecord(self, mod, target) # type: ignore # pylint: disable=no-member + + def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]: + """Query the best schedule of the given workload from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + + Returns + ------- + schedule : Optional[Schedule] + The best schedule of the given workload; None if not found. + """ + return _ffi_api.DatabaseQuerySchedule(self, mod, target) # type: ignore # pylint: disable=no-member + + def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]: + """Query the best IRModule of the given workload from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + + Returns + ------- + ir_module : Optional[IRModule] + The best IRModule of the given workload; None if not found. 
+ """ + return _ffi_api.DatabaseQueryIRModule(self, mod, target) # type: ignore # pylint: disable=no-member + + def query( + self, + mod: IRModule, + target: Target, + kind: Union[ + Literal["schedule"], + Literal["record"], + Literal["ir_module"], + ] = "schedule", + ) -> Union[Schedule, IRModule, TuningRecord]: + """Query the database to retrieve the best optimization outcome of the given workload. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + kind : str = "schedule" | "record" | "ir_module" + The kind of the optimization outcome to be returned. + + Returns + ------- + result : Union[Schedule, IRModule, TuningRecord] + The best optimization outcome of the given workload. + """ + if kind == "schedule": + return self.query_schedule(mod, target) + if kind == "record": + return self.query_tuning_record(mod, target) + if kind == "ir_module": + return self.query_ir_module(mod, target) + raise ValueError(f'Unknown kind: {kind}. 
Candidates are: "schedule", "record", "ir_module"') + + def __enter__(self) -> "Database": + """Entering the scope of the context manager""" + _ffi_api.DatabaseEnterWithScope(self) # type: ignore # pylint: disable=no-member + return self + + def __exit__(self, ptype, value, trace) -> None: + """Exiting the scope of the context manager""" + _ffi_api.DatabaseExitWithScope(self) # type: ignore # pylint: disable=no-member + + @staticmethod + def current() -> Optional["Database"]: + """Get the current database under scope.""" + return _ffi_api.DatabaseCurrent() # type: ignore # pylint: disable=no-member + @register_object("meta_schedule.PyDatabase") class _PyDatabase(Database): diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 97cbfc58a6c1..652f09261b2f 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -20,7 +20,6 @@ from os import path as osp from typing import Callable, Dict, List, Optional, Union -from tvm._ffi.registry import register_func from tvm.ir import IRModule from tvm.target import Target from tvm.tir import PrimFunc @@ -44,7 +43,6 @@ FnMutatorProb = Callable[[], Dict[Mutator, float]] -@register_func("tvm.meta_schedule.tune.parse_mod") # for use in ApplyHistoryBest def mod(mod: Union[PrimFunc, IRModule]) -> IRModule: # pylint: disable=redefined-outer-name """Normalize the input to an IRModule""" if isinstance(mod, PrimFunc): @@ -53,8 +51,6 @@ def mod(mod: Union[PrimFunc, IRModule]) -> IRModule: # pylint: disable=redefine mod = IRModule({"main": mod}) if not isinstance(mod, IRModule): raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}") - # in order to make sure the mod can be found in ApplyHistoryBest - # different func name can cause structural unequal func_names = mod.get_global_vars() (func_name,) = func_names if len(func_names) == 1 and func_name != "main": diff --git a/python/tvm/meta_schedule/relay_integration.py 
b/python/tvm/meta_schedule/relay_integration.py index d3b3ea796532..24009ab07fcf 100644 --- a/python/tvm/meta_schedule/relay_integration.py +++ b/python/tvm/meta_schedule/relay_integration.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """MetaSchedule-Relay integration""" -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import numpy as np # type: ignore from tvm import nd @@ -23,8 +23,6 @@ from tvm.ir import IRModule, transform from tvm.runtime import NDArray from tvm.target import Target -from tvm.te import Tensor -from tvm.tir import PrimFunc from .extracted_task import ExtractedTask from .utils import autotvm_silencer @@ -38,7 +36,7 @@ def extract_task_from_relay( opt_level: int = 3, pass_config: Optional[Dict[str, Any]] = None, disabled_pass: Optional[List[str]] = None, - te_filter_func: Union[str, None, Callable[[List[Tensor]], PrimFunc]] = None, + tir_converter: str = "default", ) -> List[ExtractedTask]: """Extract tuning tasks from a relay program. @@ -56,13 +54,13 @@ def extract_task_from_relay( The pass config of the compiler disabled_pass : Optional[List[str]] The list of disabled passes of the compiler - te_filter_func : Callable[[List[tvm.te.Tensor], List[NDArray]], bool] - The filter function to filter out the extracted tasks - If it's a string, it's the name of the filtering function. Built in functions are - - "meta_schedule.DefaultTaskFilter" - - "meta_schedule.DefaultTaskFilterAllowExtern" - If it's None, it's the default filtering function - If it's a callable, it's the filtering function + tir_converter : str + The filter function to filter out the extracted tasks. 
Builtin filters: + - "default" + - "allow_extern" + The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}", + with the signature below: + (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc] Returns ------- @@ -75,8 +73,6 @@ def extract_task_from_relay( # pylint: enable=import-outside-toplevel - if isinstance(te_filter_func, str): - te_filter_func = get_global_func(te_filter_func) extract_task_func = get_global_func( "relay.backend.MetaScheduleExtractTask", allow_missing=False, @@ -89,7 +85,10 @@ def extract_task_from_relay( if disabled_pass is None: disabled_pass = [] if pass_config is None: - pass_config = {"relay.backend.use_meta_schedule": True} + pass_config = { + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": tir_converter, + } if params is None: params = {} relay_params = {} @@ -110,7 +109,7 @@ def extract_task_from_relay( else: tophub_context = autotvm.utils.EmptyContext() with tophub_context: - return list(extract_task_func(mod, target, relay_params, te_filter_func)) + return list(extract_task_func(mod, target, relay_params)) def is_meta_schedule_enabled() -> bool: diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index 8010e36fd656..596a5a736333 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -15,16 +15,18 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-docstring -from distutils.util import strtobool import argparse import json import logging +from distutils.util import strtobool +from typing import Dict +import numpy as np # type: ignore import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc from tvm.meta_schedule.testing.relay_workload import get_network -from tvm.meta_schedule.testing.tune_utils import generate_input_data, create_timer +from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data from tvm.support import describe @@ -137,14 +139,24 @@ def main(): ARGS.input_shape, cache_dir=ARGS.cache_dir, ) - input_info = {input_name: input_shape} - input_data = { - item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in ARGS.input_shape + input_info = [ + { + "name": input_name, + "shape": input_shape, + "dtype": input_dtype, + }, + ] + input_data: Dict[str, np.ndarray] = { + item["name"]: generate_input_data( # type: ignore + item["shape"], # type: ignore + item["dtype"], # type: ignore + ) + for item in input_info } - for input_name, input_shape in input_info.items(): - print(f" input_name : {input_name}") - print(f" input_shape: {input_shape}") - print(f" input_dtype: {input_dtype}") + for item in input_info: + print(f" input_name : {item['name']}") + print(f" input_shape: {item['shape']}") + print(f" input_dtype: {item['dtype']}") runner = ms.runner.RPCRunner( rpc_config=ARGS.rpc_config, diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py index dda492008ffe..5919fb47c809 100644 --- a/python/tvm/meta_schedule/testing/utils.py +++ b/python/tvm/meta_schedule/testing/utils.py @@ -16,12 +16,13 @@ # under the License. 
"""Testing utility functions in meta schedule""" from typing import Callable, Dict, Optional, Union + +from tvm import meta_schedule as ms from tvm.ir import IRModule, transform from tvm.relay import Function as RelayFunc from tvm.runtime import NDArray from tvm.target import Target from tvm.tir import Schedule -from tvm import meta_schedule as ms def apply_fixed_schedules( @@ -29,10 +30,10 @@ def apply_fixed_schedules( target: Union[str, Target], params: Optional[Dict[str, NDArray]], schedule_fn: Callable[[ms.ExtractedTask, Schedule], bool], - te_filter_func=None, + tir_converter: str = "default", ): """Apply fixed schedules (manually written, without any tunable knobs) as specified by - schedule_fn to extracted tasks, and return a database that can be passed to ApplyHistoryBest. + schedule_fn to extracted tasks, and return a database that can be passed to compilation. Parameters ---------- @@ -45,13 +46,13 @@ def apply_fixed_schedules( schedule_fn : Callable[[ExtractedTask, Schedule], bool] A callable that is applied for each extracted task and the corresponding default schedule. Returns True if the given schedule should be committed to the database, False otherwise. - te_filter_func : Union[str, None, Callable[[List[Tensor], List[NDArray]], PrimFunc]] = None - The filtering function for TE computation - If it's a string, it's the name of the filtering function. Built in functions are - - "meta_schedule.DefaultTaskFilter" - - "meta_schedule.DefaultTaskFilterAllowExtern" - If it's None, it's the default filtering function - If it's a callable, it's the filtering function + tir_converter : str + The filter function to filter out the extracted tasks. 
Builtin filters: + - "default" + - "allow_extern" + The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}", + with the signature below: + (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc] Returns ------- @@ -64,7 +65,10 @@ def apply_fixed_schedules( config[k] = v extracted_tasks = ms.extract_task_from_relay( - relay_mod, target, params, te_filter_func=te_filter_func, pass_config=config + relay_mod, + target, + params, + tir_converter=tir_converter, ) database = ms.database.MemoryDatabase() for task in extracted_tasks: diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 447fb56637ef..20eccc30a113 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -24,14 +24,12 @@ from tvm.ir import IRModule from tvm.ir.transform import PassContext -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply from tvm.runtime import Module, NDArray, vm from tvm.target import Target from tvm.te import Tensor, create_prim_func from tvm.tir import PrimFunc, Schedule from . 
import default_config -from .apply_history_best import ApplyHistoryBest from .builder import Builder from .cost_model import CostModel from .database import Database, TuningRecord @@ -43,7 +41,7 @@ from .runner import Runner from .schedule_rule import ScheduleRule from .search_strategy import EvolutionarySearch, ReplayFunc, ReplayTrace -from .space_generator import SpaceGenerator +from .space_generator import PostOrderApply, SpaceGenerator from .task_scheduler import GradientBased, RoundRobin from .tune_context import TuneContext from .utils import autotvm_silencer, batch_parameterize_config @@ -461,7 +459,7 @@ def _f_block_filter(block, target_names) -> bool: mutator_probs=mutator_probs, num_threads=num_threads, ) - with Profiler.timeit("ApplyHistoryBest"): + with Profiler.timeit("PostTuningCompilation"): bests: List[TuningRecord] = database.get_top_k(database.commit_workload(mod), top_k=1) if not bests: return None @@ -591,6 +589,7 @@ def tune_relay( """ # pylint: disable=import-outside-toplevel from tvm import relay + from .relay_integration import extract_task_from_relay # pylint: disable=protected-access, enable=import-outside-toplevel @@ -615,13 +614,14 @@ def tune_relay( num_threads=num_threads, ) relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend] - with Profiler.timeit("ApplyHistoryBest"): - with target, autotvm_silencer(), ApplyHistoryBest(database): + with Profiler.timeit("PostTuningCompilation"): + with target, autotvm_silencer(), database: with PassContext( opt_level=3, config={ "relay.backend.use_meta_schedule": True, "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda", + "relay.backend.tir_converter": "default", }, ): return relay_build(mod, target=target, params=params) diff --git a/src/meta_schedule/apply_history_best.cc b/src/meta_schedule/apply_history_best.cc deleted file mode 100644 index 62db29306777..000000000000 --- a/src/meta_schedule/apply_history_best.cc +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed 
to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -#include - -#include "./utils.h" - -namespace tvm { -namespace meta_schedule { - -/**************** Utility functions ****************/ - -template -Optional GetOnlyOneFunctionCommon(const IRModule& mod, Callback on_found) { - if (mod->functions.size() != 1) { - return NullOpt; - } - for (const auto& kv : mod->functions) { - const BaseFunc& func = kv.second; - if (!func->IsInstance()) { - return NullOpt; - } else { - return on_found(kv); - } - } - return NullOpt; -} - -template -Optional GetOnlyOneFunctionKey(const IRModule& mod) { - return GetOnlyOneFunctionCommon(mod, [](auto kv) { return kv.first; }); -} - -template -Optional GetOnlyOneFunction(const IRModule& mod) { - return GetOnlyOneFunctionCommon( - mod, [](auto kv) { return Downcast(kv.second); }); -} - -template -bool HasOnlyOneFunction(const IRModule& mod) { - return GetOnlyOneFunction(mod).defined(); -} - -/**************** Context Manager ****************/ - -class ApplyHistoryBestInternal { - public: - static void EnterScope(ApplyHistoryBest ctx) { ctx.EnterWithScope(); } - static void ExitScope(ApplyHistoryBest ctx) { ctx.ExitWithScope(); } -}; - -struct ApplyHistoryBestThreadLocalEntry { - Optional ctx; -}; - 
-using ApplyHistoryBestThreadLocalStore = dmlc::ThreadLocalStore; - -Optional ApplyHistoryBest::Current() { - return ApplyHistoryBestThreadLocalStore::Get()->ctx; -} - -void ApplyHistoryBest::EnterWithScope() { - Optional& ctx = ApplyHistoryBestThreadLocalStore::Get()->ctx; - CHECK(!ctx.defined()) << "ValueError: Nested ApplyHistoryBest context managers are not allowed"; - ctx = *this; -} - -void ApplyHistoryBest::ExitWithScope() { - Optional& ctx = ApplyHistoryBestThreadLocalStore::Get()->ctx; - ICHECK(ctx.defined()); - ctx = NullOpt; -} - -/**************** ApplyHistoryBest ****************/ - -ApplyHistoryBest::ApplyHistoryBest(Database database, - ApplyHistoryBestNode::FTEFilterFunc te_filter_func, - PackedFunc logging_func) { - ObjectPtr n = make_object(); - n->database = database; - n->te_filter_func = te_filter_func; - n->logging_func = logging_func; - if (te_filter_func == nullptr) { - n->te_filter_func = DefaultTaskFilter; - } - data_ = n; -} - -Optional ApplyHistoryBestNode::Query(runtime::String task_name, IRModule mod, - Target target, Optional> dispatched, - FTakeTuningRecord f_take_tuning_record, - FDirectDispatch f_direct_dispatch) { - ICHECK(dispatched.defined()); - ICHECK_EQ(dispatched.value().size(), 1); - ICHECK(HasOnlyOneFunction(mod)) << mod; - IRModule prim_mod = dispatched.value()[0]; - ICHECK(HasOnlyOneFunction(prim_mod)) << prim_mod; - - // Keep the original func name to be returned later. 
- GlobalVar gv = GetOnlyOneFunctionKey(prim_mod).value(); - - // Unify func name to make sure it can be found in database - const auto* parse_mod_func = runtime::Registry::Get("tvm.meta_schedule.tune.parse_mod"); - ICHECK(parse_mod_func) << "Parse mod function not defined!"; - prim_mod = (*parse_mod_func)(prim_mod); - - if (f_direct_dispatch != nullptr) { - Optional mod = f_direct_dispatch(prim_mod); - if (mod.defined()) { - TVM_PY_LOG(INFO, logging_func) << "Direct dispatch applied for workload: " << task_name; - return mod.value(); - } - } - if (database->HasWorkload(prim_mod)) { - Array records = database->GetTopK(database->CommitWorkload(prim_mod), 1); - if (records.size() == 1) { - if (f_take_tuning_record != nullptr) { - f_take_tuning_record(records[0]); - } - tir::Schedule sch = - tir::Schedule::Traced(records[0]->workload->mod, /*seed=*/-1, /*debug_mask=*/0, - /*error_render_level=*/tir::ScheduleErrorRenderLevel::kNone); - records[0]->trace->ApplyToSchedule(sch, false); - tir::PrimFunc func = GetOnlyOneFunction(sch->mod()).value(); - // Make sure we return the updated PrimFunc paired with the original func name. 
- return IRModule({{gv, func}}); - } - } - TVM_PY_LOG(WARNING, logging_func) << "Cannot find workload: " << task_name; - return NullOpt; -} - -TVM_REGISTER_NODE_TYPE(ApplyHistoryBestNode); -TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBest") - .set_body_typed([](Database database, ApplyHistoryBestNode::FTEFilterFunc te_filter_func, - PackedFunc logging_func) -> ApplyHistoryBest { - return ApplyHistoryBest(database, te_filter_func, logging_func); - }); -TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestEnterScope") - .set_body_typed(ApplyHistoryBestInternal::EnterScope); -TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestExitScope") - .set_body_typed(ApplyHistoryBestInternal::ExitScope); -TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestCurrent") - .set_body_typed(ApplyHistoryBest::Current); -TVM_REGISTER_GLOBAL("meta_schedule.ApplyHistoryBestQuery") - .set_body_method(&ApplyHistoryBestNode::Query); - -} // namespace meta_schedule -} // namespace tvm diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index 4e180c4fab61..fedd2aa35278 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -154,6 +154,59 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w return TuningRecord(trace, workload, run_secs, target, args_info); } +/******** Database ********/ + +Optional DatabaseNode::QueryTuningRecord(IRModule mod, Target target) { + if (!this->HasWorkload(mod)) { + return NullOpt; + } + Array records = this->GetTopK(this->CommitWorkload(mod), 1); + if (records.empty()) { + return NullOpt; + } + ICHECK_EQ(records.size(), 1); + return records[0]; +} + +Optional DatabaseNode::QuerySchedule(IRModule mod, Target target) { + if (Optional opt_record = this->QueryTuningRecord(mod, target)) { + TuningRecord record = opt_record.value(); + tir::Schedule sch = + tir::Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0, + 
/*error_render_level=*/tir::ScheduleErrorRenderLevel::kDetail); + record->trace->ApplyToSchedule(sch, false); + return sch; + } else { + return NullOpt; + } +} + +Optional DatabaseNode::QueryIRModule(IRModule mod, Target target) { + if (Optional opt_sch = this->QuerySchedule(mod, target)) { + return opt_sch.value()->mod(); + } else { + return NullOpt; + } +} + +std::vector* ThreadLocalDatabases() { + static thread_local std::vector tls; + return &tls; +} + +void Database::EnterWithScope() { ThreadLocalDatabases()->push_back(*this); } + +void Database::ExitWithScope() { ThreadLocalDatabases()->pop_back(); } + +Optional Database::Current() { + std::vector* tls = ThreadLocalDatabases(); + if (tls->empty()) { + return NullOpt; + } else { + return tls->back(); + } +} + /******** PyDatabase ********/ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload, @@ -194,6 +247,11 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsMeasureCandidate") TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON") .set_body_method(&TuningRecordNode::AsJSON); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordFromJSON").set_body_typed(TuningRecord::FromJSON); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseEnterWithScope") + .set_body_method(&Database::EnterWithScope); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseExitWithScope") + .set_body_method(&Database::ExitWithScope); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCurrent").set_body_typed(Database::Current); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseHasWorkload") .set_body_method(&DatabaseNode::HasWorkload); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitWorkload") @@ -205,6 +263,12 @@ TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetTopK") TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetAllTuningRecords") .set_body_method(&DatabaseNode::GetAllTuningRecords); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseSize").set_body_method(&DatabaseNode::Size); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQueryTuningRecord") + 
.set_body_method(&DatabaseNode::QueryTuningRecord); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQuerySchedule") + .set_body_method(&DatabaseNode::QuerySchedule); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseQueryIRModule") + .set_body_method(&DatabaseNode::QueryIRModule); TVM_REGISTER_GLOBAL("meta_schedule.DatabasePyDatabase").set_body_typed(Database::PyDatabase); } // namespace meta_schedule diff --git a/src/meta_schedule/extracted_task.cc b/src/meta_schedule/extracted_task.cc index 3406f82eb1f0..ec04361f51ec 100644 --- a/src/meta_schedule/extracted_task.cc +++ b/src/meta_schedule/extracted_task.cc @@ -38,67 +38,6 @@ ExtractedTask::ExtractedTask(String task_name, IRModule mod, Target target, data_ = n; } -Optional DefaultTaskFilterImpl(const Array& args, - const Array& constants, - bool allow_extern_op) { - using namespace ::tvm::te; - std::vector stack; - std::unordered_set visited; - for (const Tensor& v : args) { - for (const PrimExpr& e : v->shape) { - // Dynamic shape is not supported for now - if (!e->IsInstance()) { - return NullOpt; - } - } - if (!visited.count(v.get())) { - visited.insert(v.get()); - stack.push_back(v); - } - } - while (!stack.empty()) { - Tensor tensor = stack.back(); - stack.pop_back(); - if (tensor->op->IsInstance()) { - // do nothing - } else if (tensor->op->IsInstance() || - (allow_extern_op && tensor->op->IsInstance())) { - Array inputs = tensor->op->InputTensors(); - for (const Tensor& v : inputs) { - if (!visited.count(v.get())) { - visited.insert(v.get()); - stack.push_back(v); - } - } - } else { - return NullOpt; - } - } - PrimFunc func = te::CreatePrimFuncWithConstants(args, constants); - bool dynamic_loop_extent = false; - PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void { - if (const auto* loop = obj.as()) { - if (!loop->extent->IsInstance()) { - dynamic_loop_extent = true; - } - } - }); - if (dynamic_loop_extent) { - return NullOpt; - } - return func; -} - -Optional DefaultTaskFilter(const 
Array& args, - const Array& constants) { - return DefaultTaskFilterImpl(args, constants, false); -} - -Optional DefaultTaskFilterAllowExtern(const Array& args, - const Array& constants) { - return DefaultTaskFilterImpl(args, constants, true); -} - TVM_REGISTER_NODE_TYPE(ExtractedTaskNode); TVM_REGISTER_GLOBAL("meta_schedule.ExtractedTask") .set_body_typed([](String task_name, IRModule mod, Target target, Array dispatched, @@ -106,14 +45,5 @@ TVM_REGISTER_GLOBAL("meta_schedule.ExtractedTask") return ExtractedTask(task_name, mod, target, dispatched, weight); }); -TVM_REGISTER_GLOBAL("meta_schedule.DefaultTaskFilter") - .set_body_typed([](const Array& args, const Array& constants) { - return DefaultTaskFilter(args, constants); - }); - -TVM_REGISTER_GLOBAL("meta_schedule.DefaultTaskFilterAllowExtern") - .set_body_typed([](const Array& args, const Array& constants) { - return DefaultTaskFilterAllowExtern(args, constants); - }); } // namespace meta_schedule } // namespace tvm diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index 664a6a609e7f..db37935ec206 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/src/relay/backend/task_extraction.cc b/src/relay/backend/task_extraction.cc index 4f83b6eeed60..213841c621de 100644 --- a/src/relay/backend/task_extraction.cc +++ b/src/relay/backend/task_extraction.cc @@ -16,8 +16,6 @@ * specific language governing permissions and limitations * under the License. 
*/ - -#include #include #include #include @@ -32,13 +30,10 @@ namespace tvm { namespace relay { namespace backend { -Array ExtractTask( - IRModule mod, Target target, Map params, - meta_schedule::ApplyHistoryBestNode::FTEFilterFunc filter_func) { +Array ExtractTask(IRModule mod, Target target, + Map params) { using meta_schedule::ExtractedTask; - if (filter_func == nullptr) { - filter_func = tvm::meta_schedule::DefaultTaskFilter; - } + backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter(); backend::BindParamsInModule(mod, params); // is_vm=true for backward compatibility Array pass_seqs = relay::backend::GetPassPrefix(/*is_homogenous=*/true, /*is_vm=*/true); @@ -48,7 +43,7 @@ Array ExtractTask( std::vector tasks; std::unordered_map cache; - PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &filter_func](const Expr& exp) { + PostOrderVisit(mod->Lookup("main"), [&target, &tasks, &cache, &tir_converter](const Expr& exp) { if (exp->IsInstance()) { Function relay_func = Downcast(exp); if (!relay_func->HasNonzeroAttr(attr::kPrimitive)) { @@ -62,13 +57,11 @@ Array ExtractTask( } auto [inputs_outputs, constants, fused_name] = tec::LowerTECompute(relay_func, target, /*return_inputs=*/true); - if (Optional prim_func = filter_func(inputs_outputs, constants)) { - GlobalVar prim_fn_var(fused_name); - IRModule relay_mod({{prim_fn_var, relay_func}}); - IRModule tir_mod({{prim_fn_var, prim_func.value()}}); - ExtractedTask extracted_task(fused_name, relay_mod, target, {tir_mod}, 1); - tasks.push_back(extracted_task); - cache.emplace(cache_key, extracted_task); + if (Optional f = tir_converter(inputs_outputs, constants)) { + IRModule relay_mod({{GlobalVar(fused_name), relay_func}}); + ExtractedTask task(fused_name, relay_mod, target, {PrimFuncToIRModule(f.value())}, 1); + tasks.push_back(task); + cache.emplace(cache_key, task); } } }); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 5c79ed2070cc..8fa8610c0fca 
100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -548,6 +548,7 @@ TECompiler& TECompiler::Global() { TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_meta_schedule_dispatch", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.tir_converter", String); TVM_REGISTER_GLOBAL("relay.backend._TECompilerGlobal").set_body_typed([]() { return TECompiler::Global(); diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 92cc6f8cfa46..0e2a3e270257 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -61,16 +62,6 @@ TVM_REGISTER_NODE_TYPE(CachedFuncNode); TVM_REGISTER_NODE_TYPE(CCacheKeyNode); TVM_REGISTER_NODE_TYPE(CCacheValueNode); -void ExtractTransformLayout(const meta_schedule::TuningRecord& record) { - static tir::InstructionKind kind_transform_layout = tir::InstructionKind::Get("TransformLayout"); - for (const tir::Instruction& inst : record->trace->insts) { - if (inst->kind.same_as(kind_transform_layout)) { - ICHECK_EQ(inst->attrs.size(), 3); - relay::MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast(inst->attrs[2])); - } - } -} - LoweredOutput::LoweredOutput(tvm::Array outputs, OpImplementation impl) { auto n = make_object(); n->outputs = std::move(outputs); @@ -317,11 +308,11 @@ class ScheduleBuilder : public ExprVisitor { // Whether to use auto_scheduler schedule. 
use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); if (backend::IsMetaScheduleEnabled()) { - meta_schedule_ctx_ = meta_schedule::ApplyHistoryBest::Current(); - CHECK(meta_schedule_ctx_.defined()) << "ValueError: `use_meta_schedule` is enabled in Relay " - "build, but no ApplyHistoryBest context is provided. "; + database_ = meta_schedule::Database::Current(); + CHECK(database_.defined()) << "ValueError: `use_meta_schedule` is enabled in Relay " + "build, but no `meta_schedule.Database` context is provided. "; } else { - meta_schedule_ctx_ = NullOpt; + database_ = NullOpt; } } @@ -359,32 +350,43 @@ class ScheduleBuilder : public ExprVisitor { schedule = Downcast(obj); } } - if (meta_schedule_ctx_) { + if (database_) { + using tvm::meta_schedule::TuningRecord; + using tvm::tir::IndexMap; + using tvm::tir::Instruction; + using tvm::tir::InstructionKind; + using tvm::tir::PrimFunc; + using tvm::tir::Schedule; + backend::FTECompilerTIRConverter tir_converter = backend::GetTIRConverter(); Array te_args = Concat(fn_inputs, tensor_outs); Array constants; for (auto [const_node, te_tensor] : lower_te_compute.constant_tensors_) { te_args.push_back(te_tensor); constants.push_back(const_node->data); } - - if (Optional tir_func = - meta_schedule_ctx_.value()->te_filter_func(te_args, constants)) { - IRModule relay_mod({{prim_fn_var, relay_func}}); - IRModule tir_mod({{prim_fn_var, tir_func.value()}}); - if (Optional opt_scheduled_mod = meta_schedule_ctx_.value()->Query( - /*task_name=*/prim_fn_var->name_hint, // - /*mod=*/relay_mod, // - /*target=*/target_, // - /*dispatched=*/Array{tir_mod}, // - /*f_take_tuning_record=*/ExtractTransformLayout)) { - IRModule scheduled_mod = - tir::transform::RemoveWeightLayoutRewriteBlock()(opt_scheduled_mod.value()); - ICHECK_EQ(scheduled_mod->functions.count(prim_fn_var), 1); - prim_func = Downcast(scheduled_mod->functions[prim_fn_var]); + if (Optional f = tir_converter(te_args, constants)) { + if (Optional opt_record = 
database_.value()->QueryTuningRecord( + /*mod=*/backend::PrimFuncToIRModule(f.value()), + /*target=*/target_)) { + static InstructionKind kind_transform_layout = InstructionKind::Get("TransformLayout"); + TuningRecord record = opt_record.value(); + for (const Instruction& inst : record->trace->insts) { + if (inst->kind.same_as(kind_transform_layout)) { + ICHECK_EQ(inst->attrs.size(), 3); + MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast(inst->attrs[2])); + } + } + Schedule sch = Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0, + tir::ScheduleErrorRenderLevel::kDetail); + record->trace->ApplyToSchedule(sch, /*remove_postproc=*/false); + IRModule mod = sch->mod(); + ICHECK_EQ(mod->functions.size(), 1); + mod = tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod)); + prim_func = Downcast(mod->Lookup("main")); } } } - // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule. + // Use TOPI schedule if user specified, or the function has no auto_scheduler schedule. if (!schedule.defined() && !prim_func.defined()) { if (anchor_op_.defined()) { auto anchor_impl = lower_te_compute.op_implementations_.find(anchor_op_.operator->()); @@ -422,7 +424,7 @@ class ScheduleBuilder : public ExprVisitor { } int op_pattern = fpattern[op]; - if (!use_auto_scheduler_ && !meta_schedule_ctx_.defined() && op_pattern >= kCommReduce) { + if (!use_auto_scheduler_ && !database_.defined() && op_pattern >= kCommReduce) { ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) << "Cannot apply TOPI schedule to a primitive function with two complicated ops" << " anchor=" << anchor_op_ << " current=" << op; @@ -440,7 +442,7 @@ class ScheduleBuilder : public ExprVisitor { Attrs anchor_attrs_; int anchor_op_pattern_{0}; bool use_auto_scheduler_; - Optional meta_schedule_ctx_; + Optional database_; }; /*! 
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 340986770e93..5cf7a5563d19 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -28,6 +28,9 @@ #include #include #include +#include + +#include "../../te/operation/create_primfunc.h" namespace tvm { namespace relay { @@ -368,6 +371,76 @@ void BindParamsInModule(IRModule mod, Map params) { BindParamsInModule(mod, params_tmp); } +/*! + * \brief A default TE compute to TIR compute. + * \param args The inputs/outputs of the TE compute graph. + * \param constants The constants bound to TIR + * \param allow_extern_op Whether to allow extern operation in TE. + * \return The TIR converted; NullOpt if not supported (dynamic shape) + */ +Optional DefaultTIRConverterImpl(const Array& args, + const Array& constants, + bool allow_extern_op) { + using namespace ::tvm::te; + std::vector stack; + std::unordered_set visited; + for (const Tensor& v : args) { + for (const PrimExpr& e : v->shape) { + // Dynamic shape is not supported for now + if (!e->IsInstance()) { + return NullOpt; + } + } + if (!visited.count(v.get())) { + visited.insert(v.get()); + stack.push_back(v); + } + } + while (!stack.empty()) { + Tensor tensor = stack.back(); + stack.pop_back(); + if (tensor->op->IsInstance()) { + // do nothing + } else if (tensor->op->IsInstance() || + (allow_extern_op && tensor->op->IsInstance())) { + Array inputs = tensor->op->InputTensors(); + for (const Tensor& v : inputs) { + if (!visited.count(v.get())) { + visited.insert(v.get()); + stack.push_back(v); + } + } + } else { + return NullOpt; + } + } + PrimFunc func = te::CreatePrimFuncWithConstants(args, constants); + bool dynamic_loop_extent = false; + tir::PostOrderVisit(func->body, [&dynamic_loop_extent](const ObjectRef& obj) -> void { + if (const auto* loop = obj.as()) { + if (!loop->extent->IsInstance()) { + dynamic_loop_extent = true; + } + } + }); + if (dynamic_loop_extent) { + return NullOpt; + } + return func; +} + 
+TVM_REGISTER_GLOBAL("relay.backend.tir_converter.default") + .set_body_typed([](const Array& args, + const Array& constants) -> Optional { + return DefaultTIRConverterImpl(args, constants, false); + }); + +TVM_REGISTER_GLOBAL("relay.backend.tir_converter.allow_extern") + .set_body_typed([](const Array& args, + const Array& constants) -> Optional { + return DefaultTIRConverterImpl(args, constants, true); + }); + } // namespace backend } // namespace relay } // namespace tvm diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 57c066131181..37ae9d803a35 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -558,6 +558,37 @@ inline bool IsMetaScheduleEnabled() { .value(); } +/*! + * \brief Method in TECompiler to convert TE compute to scheduleable TIR + * \param args The arguments of the TE compute + * \param constants The constants used in AllocateConst + * \return NullOpt if conversion fails; Otherwise the converted TIR + * \note This method could be further used as a task filtering mechanism in task extraction + */ +using FTECompilerTIRConverter = runtime::TypedPackedFunc< // + Optional( // + const Array& args, // + const Array& constants)>; + +/*! \brief Return a task filter for AutoTIR according to `relay.backend.tir_converter` */ +inline FTECompilerTIRConverter GetTIRConverter() { + String name = transform::PassContext::Current() + ->GetConfig("relay.backend.tir_converter", "default") + .value(); + const PackedFunc* f = runtime::Registry::Get("relay.backend.tir_converter." + name); + ICHECK(f != nullptr) << "IndexError: Cannot find TIR converter: " << name; + return FTECompilerTIRConverter(*f); +} + +/*! \brief Converts a PrimFunc to IRModule. */ +inline IRModule PrimFuncToIRModule(tir::PrimFunc f) { + f = WithAttrs(f, Map{ + {tvm::attr::kGlobalSymbol, String("main")}, + {tvm::tir::attr::kNoAlias, Bool(1)}, + }); + return IRModule({{GlobalVar("main"), f}}); +} + /*! 
* \brief Get the sequence of Relay optimization passes based on backend type. * The prefix of the Relay passes almost overlaps between the vm and graph backend, with some slight diff --git a/tests/python/integration/test_meta_schedule_auto_tensorize.py b/tests/python/integration/test_meta_schedule_auto_tensorize.py index 3397eaabbef2..7227ef0c7b79 100644 --- a/tests/python/integration/test_meta_schedule_auto_tensorize.py +++ b/tests/python/integration/test_meta_schedule_auto_tensorize.py @@ -19,13 +19,12 @@ import numpy as np import pytest - import tvm import tvm.testing import tvm.topi.testing from tvm import meta_schedule as ms from tvm import relay -from tvm.meta_schedule import ApplyHistoryBest, postproc, schedule_rule +from tvm.meta_schedule import postproc, schedule_rule from tvm.meta_schedule.relay_integration import extract_task_from_relay from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base from tvm.meta_schedule.tune import tune_extracted_tasks @@ -176,12 +175,11 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos postprocs=lambda: postprocs, ) - with ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - lib = relay.build(relay_mod, target=target, params=params) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + lib = relay.build(relay_mod, target=target, params=params) if "cascadelake" in target: asm = lib.lib.get_source("asm") @@ -267,12 +265,11 @@ def _test_bert_int8(target, sch_rules, postprocs): postprocs=lambda: postprocs, ) - with ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - lib = relay.build(relay_mod, target=target, params=params) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + lib = 
relay.build(relay_mod, target=target, params=params) dev = tvm.device("cuda" if "nvidia" in target else target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index 8e299dc935d5..c741ecb59ae0 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -19,20 +19,18 @@ import json import os import re -from io import StringIO from contextlib import redirect_stderr +from io import StringIO import numpy as np - import tvm import tvm.relay import tvm.testing from tvm import meta_schedule as ms from tvm import relay -from tvm.relay.backend import Executor, Runtime from tvm.contrib import utils from tvm.meta_schedule.testing.utils import apply_fixed_schedules - +from tvm.relay.backend import Executor, Runtime INPUT_SHAPE = (1, 3, 16, 16) @@ -421,13 +419,12 @@ def schedule_fn(task, sch): database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) with StringIO() as stderr_buf, redirect_stderr(stderr_buf): - with ms.ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - executor = Executor("graph", {"link-params": link_params}) - lib = relay.build(relay_mod, target=target, executor=executor) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + executor = Executor("graph", {"link-params": link_params}) + lib = relay.build(relay_mod, target=target, executor=executor) # Workload look up should succeed. This does not work when the test is invoked from pytest. 
assert not "Cannot find workload" in stderr_buf.getvalue() diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py index afce19a590e3..69522831ee55 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """Integration test for MetaSchedule""" -from typing import Optional import numpy as np import pytest import tvm @@ -23,11 +22,10 @@ from tvm import IRModule from tvm import meta_schedule as ms from tvm import relay, te, tir +from tvm._ffi import register_func from tvm.meta_schedule.testing.relay_workload import get_network from tvm.meta_schedule.testing.tlcbench import load_quantized_bert_base from tvm.script import tir as T -from tvm.target import Target -from tvm.tir import Schedule # pylint: disable=no-member,line-too-long,too-many-nested-blocks,unbalanced-tuple-unpacking,no-self-argument,missing-docstring,invalid-name @@ -58,10 +56,6 @@ def _has_torch(): requires_torch = pytest.mark.skipif(not _has_torch(), reason="torch is not installed") -def test_meta_schedule_apply_history_best_no_current(): - assert ms.ApplyHistoryBest.current() is None - - def test_meta_schedule_dynamic_loop_extent(): a = relay.var("a", shape=(1, 8, 8, 512), dtype="float32") b = relay.nn.adaptive_avg_pool2d(a, (7, 7), "NHWC") @@ -125,7 +119,7 @@ def test_meta_schedule_integration_extract_from_bert_base(): 12, [[64, 768], [3072, 768], [64, 3072]], ), - "fused_subtract_add_sqrt_divide_multiply_add": ( + "fused_subtract_add_rsqrt_multiply_multiply_add": ( 25, [[1, 64, 768], [1, 64, 1], [1, 64, 1], [768], [768], [1, 64, 768]], ), @@ -206,7 +200,8 @@ def test_meta_schedule_integration_extract_from_bert_base(): @requires_torch def test_meta_schedule_integration_extract_from_resnet_with_filter_func(): - def filter_func(args) -> bool: + 
@register_func("relay.backend.tir_converter.remove_purely_spatial", override=True) + def filter_func(args, _) -> bool: from tvm.te import create_prim_func # pylint: disable=import-outside-toplevel has_complex_op = False @@ -236,7 +231,7 @@ def traverse(t): mod, target="llvm", params=params, - te_filter_func=filter_func, + tir_converter="remove_purely_spatial", ) expected_task_names = [ "fused_" + s @@ -267,53 +262,6 @@ def traverse(t): assert t.task_name in expected_task_names, t.task_name -@requires_torch -def test_meta_schedule_integration_apply_history_best(): - mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) - database = ms.database.MemoryDatabase() - env = ms.ApplyHistoryBest(database) - target = Target("llvm") - workload = database.commit_workload(MockModule) - database.commit_tuning_record( - ms.database.TuningRecord( - trace=Schedule(MockModule).trace, - workload=workload, - run_secs=[1.0], - target=target, - args_info=[], - ) - ) - mod = env.query( - task_name="mock-task", - mod=mod, - target=target, - dispatched=[MockModule], - ) - assert tvm.ir.structural_equal(mod, workload.mod) - - -@requires_torch -def test_meta_schedule_integration_apply_history_best_direct_dispatch(): - def direct_dispatch(mod: IRModule) -> Optional[IRModule]: - if tvm.ir.structural_equal(mod, MockModule): - return MockModule - return None - - mod, _, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) - database = ms.database.MemoryDatabase() - env = ms.ApplyHistoryBest(database) - target = Target("llvm") - workload = database.commit_workload(MockModule) - mod = env.query( - task_name="mock-task-direct-dispatch", - mod=mod, - target=target, - dispatched=[MockModule], - f_direct_dispatch=direct_dispatch, - ) - assert tvm.ir.structural_equal(mod, workload.mod) - - @pytest.mark.skip("Too slow on CI") def extract_task_qbert(): mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128) diff --git 
a/tests/python/unittest/test_meta_schedule_multi_anchor.py b/tests/python/unittest/test_meta_schedule_multi_anchor.py index b7d012ca04d6..177001781179 100644 --- a/tests/python/unittest/test_meta_schedule_multi_anchor.py +++ b/tests/python/unittest/test_meta_schedule_multi_anchor.py @@ -70,7 +70,7 @@ def schedule_fn(task, sch): return False database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) - with ms.ApplyHistoryBest(database): + with database: with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_meta_schedule": True}, diff --git a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py index 058012cb643a..939851a65731 100644 --- a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py +++ b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py @@ -19,7 +19,6 @@ import tvm.testing import tvm.topi.testing from tvm import autotvm, relay, te -from tvm.meta_schedule import ApplyHistoryBest from tvm.meta_schedule.testing.utils import apply_fixed_schedules from tvm.relay.testing.temp_op_attr import TempOpAttr from tvm.script import tir as T @@ -152,17 +151,16 @@ def schedule_fn(task, sch): target, params, schedule_fn, - te_filter_func="meta_schedule.DefaultTaskFilterAllowExtern", + tir_converter="allow_extern", ) - with ApplyHistoryBest( - database, - te_filter_func="meta_schedule.DefaultTaskFilterAllowExtern", + with database, tvm.transform.PassContext( + opt_level=3, + config={ + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "allow_extern", + }, ): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - lib = relay.build(relay_mod, target=target, params=params) + lib = relay.build(relay_mod, target=target, params=params) dev = tvm.device(target, 0) diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py 
b/tests/python/unittest/test_meta_schedule_tune_relay.py index 7d85b8757ae2..bc37fed7d691 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -245,12 +245,11 @@ def print_results(self) -> None: database.commit_workload(tvmgen_default_fused_layout_transform_1) database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc) - with ms.ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - rt_mod1 = relay.build(mod, target=target, params=params) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + rt_mod1 = relay.build(mod, target=target, params=params) # Compile without meta-schedule for correctness check with tvm.transform.PassContext(opt_level=0): @@ -307,12 +306,11 @@ def test_meta_schedule_relay_lowering(): args_info=[], ) ) - with ms.ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - rt_mod1 = relay.build(mod, target=target, params=params) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + rt_mod1 = relay.build(mod, target=target, params=params) # Compile without meta-schedule for correctness check with tvm.transform.PassContext(opt_level=0): @@ -472,24 +470,23 @@ def schedule_fn(task, sch): database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) - with ms.ApplyHistoryBest(database): - with tvm.transform.PassContext( - opt_level=3, - config={"relay.backend.use_meta_schedule": True}, - ): - # pylint: disable=W0105 - """ - The log should say - Warning: Cannot find workload: tvmgen_default_fused_expand_dims - Warning: Cannot find workload: tvmgen_default_fused_cast - Warning: Cannot find workload: tvmgen_default_fused_cast_1 - Warning: Cannot find workload: 
tvmgen_default_fused_nn_batch_matmul - - This means batch matmul and others are scheduled by TE, and dense (the one not warned) - is found in the meta schedule tuning database during ApplyHistoryBest - """ - # pylint: enable=W0105 - lib = relay.build(relay_mod, target=target, params=params) + with database, tvm.transform.PassContext( + opt_level=3, + config={"relay.backend.use_meta_schedule": True}, + ): + # pylint: disable=W0105 + """ + The log should say + Warning: Cannot find workload: tvmgen_default_fused_expand_dims + Warning: Cannot find workload: tvmgen_default_fused_cast + Warning: Cannot find workload: tvmgen_default_fused_cast_1 + Warning: Cannot find workload: tvmgen_default_fused_nn_batch_matmul + + This means batch matmul and others are scheduled by TE, and dense (the one not warned) + is found in the meta schedule tuning database during compilation + """ + # pylint: enable=W0105 + lib = relay.build(relay_mod, target=target, params=params) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) From 534412896e6d39ee4f830d63370d02e8e5f09050 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Sat, 27 Aug 2022 11:14:58 -0700 Subject: [PATCH 062/704] [TIR] Expose MMA-related PTX builtins (#12623) Expose MMA-related PTX builtins This PR exposes the following TIR operation in python: `ptx_mma`: tested `ptx_mma_sp`: tested `mma_store`: add new unittest `mma_fill`: add new unittest Co-authored-by: yongwww Co-authored-by: yongwww --- python/tvm/tir/__init__.py | 1 + python/tvm/tir/op.py | 287 +++++++++++++++++++++ tests/python/unittest/test_tir_op_types.py | 75 ++++++ 3 files changed, 363 insertions(+) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 4a6f32d03a2b..8e637d2d6564 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -59,6 +59,7 @@ tvm_bmma_sync, tvm_fill_fragment, ) +from .op import ptx_mma, ptx_mma_sp, mma_store, mma_fill from .op import ptx_ldmatrix, ptx_cp_async, ptx_commit_group, 
ptx_wait_group from .op import vectorlow, vectorhigh, vectorcombine from .op import infinity, reinterpret diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index e510f68a68a1..1fd3050c0a7f 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -831,6 +831,293 @@ def tvm_store_matrix_sync(fragment, m, n, k, index, buffer_ptr, stride, layout): ) +def ptx_mma( + dtype, + shape, + A_layout, + B_layout, + A_dtype, + B_dtype, + C_dtype, + multiplicand_a, + a_index, + multiplicand_b, + b_index, + accumulator, + c_index, + saturate, + operator=None, +): + """TVM intrinsic for ptx tensor core mma instructions + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-mma + + Parameters + ---------- + dtype : str + The data type of the result. + + shape : str + The shape of mma fragment. + + A_layout : Literal["row", "col"] + The layout of multiplicand fragment A. + + B_layout : Literal["row", "col"] + The layout of multiplicand fragment B. + + A_dtype : str + The data type of multiplicand fragment A. + + B_dtype : str + The data type of multiplicand fragment B. + + C_dtype : str + The data type of accumulator fragment C. + + multiplicand_a : Var + The multiplicand fragment A variable. + + a_index : Expr + The index of multiplicand fragment A. + + multiplicand_b : Var + The multiplicand fragment B variable. + + b_index : Expr + The index of multiplicand fragment B. + + accumulator : Var + The accumulator fragment C variable. + + c_index : Expr + The index of accumulator fragment C. + + saturate : bool + The optional saturation at the output. + + + operator : Optional[Literal["xor", "and"]] + The 1-bit operator. + + Returns + ------- + call : PrimExpr + The call expression. 
+ """ + if operator is None: + return call_intrin( + dtype, + "tir.ptx_mma", + shape, + A_layout, + B_layout, + A_dtype, + B_dtype, + C_dtype, + multiplicand_a, + a_index, + multiplicand_b, + b_index, + accumulator, + c_index, + saturate, + ) + return call_intrin( + dtype, + "tir.ptx_mma", + shape, + A_layout, + B_layout, + A_dtype, + B_dtype, + C_dtype, + multiplicand_a, + a_index, + multiplicand_b, + b_index, + accumulator, + c_index, + saturate, + operator, + ) + + +def ptx_mma_sp( + dtype, + shape, + A_layout, + B_layout, + A_dtype, + B_dtype, + C_dtype, + multiplicand_a, + a_index, + multiplicand_b, + b_index, + accumulator, + c_index, + metadata, + meta_index, + sparse_selector, + saturate, +): + """TVM intrinsic for sparse tensor core ptx instructions + https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-for-sparse-mma + + Parameters + ---------- + dtype : str + The data type of the result. + + shape : str + The shape of mma fragment. + + A_layout : Literal["row", "col"] + The layout of multiplicand fragment A. + + B_layout : Literal["row", "col"] + The layout of multiplicand fragment B. + + A_dtype : str + The data type of multiplicand fragment A. + + B_dtype : str + The data type of multiplicand fragment B. + + C_dtype : str + The data type of accumulator fragment C. + + multiplicand_a : Var + The multiplicand fragment A variable. + + a_index : Expr + The index of multiplicand fragment A. + + multiplicand_b : Var + The multiplicand fragment B variable. + + b_index : Expr + The index of multiplicand fragment B. + + accumulator : Var + The accumulator fragment C variable. + + c_index : Expr + The index of accumulator fragment C. + + metadata : Expr + The metadata of operand. + + meta_index : Expr + The metadata index of operand. + + sparse_selector : Expr + The sparse selector indicating the thread that stores the metadata. + + saturate : bool + The optional saturation at the output. 
+ + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + dtype, + "tir.ptx_mma_sp", + shape, + A_layout, + B_layout, + A_dtype, + B_dtype, + C_dtype, + multiplicand_a, + a_index, + multiplicand_b, + b_index, + accumulator, + c_index, + metadata, + meta_index, + sparse_selector, + saturate, + ) + + +def mma_store(dtype, m, n, dst_ptr, src_ptr, src_offset, dst_stride): + """TVM intrinsic for storing the result of PTX MMA into a destination pointer + + Parameters + ---------- + dtype : str + The data type of the result. + + m : IntImm + The shape of mma fragment. + + n : IntImm + The shape of mma fragment. + + dst_ptr : Var + The destination pointer variable. + + src_ptr : Var + The source pointer variable. + + src_offset : Expr + The source offset. + + dst_stride : Var + The destination stride. + + Returns + ------- + call : PrimExpr + The call expression. + """ + return call_intrin( + dtype, + "tir.mma_store", + m, + n, + dst_ptr, + src_ptr, + src_offset, + dst_stride, + ) + + +def mma_fill(dtype, local_size, local_ptr, offset): + """TVM intrinsic for zero-initializing an MMA accumulation register + + Parameters + ---------- + dtype : str + The data type of the result. + + local_size : IntImm + The number of elements. + + local_ptr : Var + The destination pointer variable. + + offset : Expr + The destination offset. + + Returns + ------- + call : PrimExpr + The call expression. 
+ """ + return call_intrin( + dtype, + "tir.mma_fill", + local_size, + local_ptr, + offset, + ) + + def ptx_ldmatrix(dtype, trans, num, type, local_ptr, local_offset, smem_ptr, smem_offset): """TVM intrinsic for ptx load matrix from shared memory https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix diff --git a/tests/python/unittest/test_tir_op_types.py b/tests/python/unittest/test_tir_op_types.py index f8e8de074c42..23a264bef75a 100644 --- a/tests/python/unittest/test_tir_op_types.py +++ b/tests/python/unittest/test_tir_op_types.py @@ -143,6 +143,81 @@ def test_tir_op_tvm_fill_fragment(): assert expr.op.name == "tir.tvm_fill_fragment" +def test_tir_op_ptx_mma(): + buffer_a = tir.decl_buffer([32], "int4", scope="local") + buffer_b = tir.decl_buffer([16], "uint4", scope="local") + buffer_c = tir.decl_buffer([4], "int32", scope="local") + expr = tir.ptx_mma( + "int32", + "m8n8k32", + "row", + "col", + "int4", + "uint4", + "int32", + buffer_a.data, + 0, + buffer_b.data, + 0, + buffer_c.data, + 0, + False, + ) + assert expr.op.name == "tir.ptx_mma" + + +def test_tir_op_ptx_mma_sp(): + buffer_a = tir.decl_buffer([32], "int4", scope="local") + buffer_b = tir.decl_buffer([16], "uint4", scope="local") + buffer_c = tir.decl_buffer([4], "int32", scope="local") + buffer_d = tir.decl_buffer([1], "uint32", scope="local") + expr = tir.ptx_mma_sp( + "int32", + "m8n8k32", + "row", + "col", + "int4", + "uint4", + "int32", + buffer_a.data, + 0, + buffer_b.data, + 0, + buffer_c.data, + 0, + buffer_d.data, + 0, + 0, + False, + ) + assert expr.op.name == "tir.ptx_mma_sp" + + +def test_tir_op_mma_store(): + x = tir.Var("x", dtype="int32") + y = tir.Var("y", dtype="int32") + buffer_w = tir.decl_buffer([16, 8], dtype="int32", scope="warp", offset_factor=1) + buffer = tir.decl_buffer( + [16, 16], dtype="int32", scope="global", offset_factor=1, strides=[x, y] + ) + expr = tir.mma_store( + "int32", + 16, + 16, + buffer.access_ptr("w"), + 
buffer_w.data, + buffer_w.elem_offset, + x, + ) + assert expr.op.name == "tir.mma_store" + + +def test_tir_op_mma_fill(): + buffer_w = tir.decl_buffer([16, 8], dtype="int32", scope="warp", offset_factor=1) + expr = tir.mma_fill("int32", 8, buffer_w.data, buffer_w.elem_offset) + assert expr.op.name == "tir.mma_fill" + + def test_op_ptx_ldmatrix(): buffer_shared = tir.decl_buffer([16, 16], "float16", scope="shared") buffer_local = tir.decl_buffer([8], "float16", scope="local") From 648a29a53a641f1e923220600dce9c9215104879 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 29 Aug 2022 00:34:11 -0700 Subject: [PATCH 063/704] [MetaSchedule] Introduce `ScheduleFnDatabase` (#12626) Following #12520, this PR introduces `ScheduleFnDatabase`, a mocked database to allow injecting handcrafted schedules provided by a schedule function. The schedule function comes with the following signature: ```python def schedule_fn( sch: tir.Schedule, ) -> bool: task_name = sch.mod.attrs["task_name"] # ^^^ provides an optional name of the task queried ... ``` This mocked database helps incorporate the existing testing utility `apply_fixed_schedule` more formally into the MetaSchedule-Relay build pipeline, and allows further extension to Relax with the same interface. Next as another follow-up, we will introduce ConcatDatabase that allows mixing multiple databases, including the mocked and ones from JSON files. 
--- include/tvm/meta_schedule/database.h | 19 +++- python/tvm/meta_schedule/database/__init__.py | 1 + python/tvm/meta_schedule/database/database.py | 41 +++++-- .../database/schedule_fn_database.py | 38 +++++++ python/tvm/meta_schedule/testing/utils.py | 83 -------------- src/meta_schedule/database/database.cc | 13 ++- src/meta_schedule/database/memory_database.cc | 10 +- .../database/schedule_fn_database.cc | 103 ++++++++++++++++++ src/relay/backend/te_compiler_cache.cc | 5 +- tests/python/unittest/test_link_params.py | 15 ++- .../test_meta_schedule_multi_anchor.py | 8 +- .../test_meta_schedule_relay_tir_compute.py | 18 +-- .../unittest/test_meta_schedule_tune_relay.py | 7 +- 13 files changed, 226 insertions(+), 135 deletions(-) create mode 100644 python/tvm/meta_schedule/database/schedule_fn_database.py delete mode 100644 python/tvm/meta_schedule/testing/utils.py create mode 100644 src/meta_schedule/database/schedule_fn_database.cc diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 0e7f45d39332..88db2e227786 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -207,23 +207,29 @@ class DatabaseNode : public runtime::Object { * \brief Query the best record of the given workload from the database. * \param mod The IRModule to be searched for. * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. * \return The best record of the given workload; NullOpt if not found. */ - virtual Optional QueryTuningRecord(IRModule mod, Target target); + virtual Optional QueryTuningRecord(const IRModule& mod, const Target& target, + const String& workload_name); /*! * \brief Query the best schedule of the given workload from the database. * \param mod The IRModule to be searched for. * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. 
* \return The schedule in the best schedule of the given workload; NullOpt if not found. */ - virtual Optional QuerySchedule(IRModule mod, Target target); + virtual Optional QuerySchedule(const IRModule& mod, const Target& target, + const String& workload_name); /*! * \brief Query the best IRModule of the given workload from the database. * \param mod The IRModule to be searched for. * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. * \return The IRModule in the best IRModule of the given workload; NullOpt if not found. */ - virtual Optional QueryIRModule(IRModule mod, Target target); + virtual Optional QueryIRModule(const IRModule& mod, const Target& target, + const String& workload_name); static constexpr const char* _type_key = "meta_schedule.Database"; TVM_DECLARE_BASE_OBJECT_INFO(DatabaseNode, runtime::Object); @@ -336,6 +342,13 @@ class Database : public runtime::ObjectRef { public: /*! An in-memory database. */ TVM_DLL static Database MemoryDatabase(); + /*! + * \brief A database for injecting handcrafted schedule functions. + * \param schedule_fn The function to do scheduling, which takes a TIR schedule, + * and returns a boolean indicating if the schedule is successful. + */ + TVM_DLL static Database ScheduleFnDatabase( + runtime::TypedPackedFunc schedule_fn); /*! * \brief Create a default database that uses JSON file for tuning records. * \param path_workload The path to the workload table. 
diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py index 2a87eea147d9..7726daf6eb63 100644 --- a/python/tvm/meta_schedule/database/__init__.py +++ b/python/tvm/meta_schedule/database/__init__.py @@ -21,3 +21,4 @@ from .database import Database, PyDatabase, TuningRecord, Workload from .json_database import JSONDatabase from .memory_database import MemoryDatabase +from .schedule_fn_database import ScheduleFnDatabase diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py index 68283b4554e5..aa509b715132 100644 --- a/python/tvm/meta_schedule/database/database.py +++ b/python/tvm/meta_schedule/database/database.py @@ -235,7 +235,12 @@ def __len__(self) -> int: """ return _ffi_api.DatabaseSize(self) # type: ignore # pylint: disable=no-member - def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningRecord]: + def query_tuning_record( + self, + mod: IRModule, + target: Target, + workload_name: str, + ) -> Optional[TuningRecord]: """Query the best record of the given workload from the database. Parameters @@ -244,15 +249,22 @@ def query_tuning_record(self, mod: IRModule, target: Target) -> Optional[TuningR The IRModule to be searched for. target : Target The target to be searched for. + workload_name : str + The name of the workload to be searched for. Returns ------- tuning_record : Optional[TuningRecord] The best record of the given workload; None if not found. 
""" - return _ffi_api.DatabaseQueryTuningRecord(self, mod, target) # type: ignore # pylint: disable=no-member + return _ffi_api.DatabaseQueryTuningRecord(self, mod, target, workload_name) # type: ignore # pylint: disable=no-member - def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]: + def query_schedule( + self, + mod: IRModule, + target: Target, + workload_name: str, + ) -> Optional[Schedule]: """Query the best schedule of the given workload from the database. Parameters @@ -261,15 +273,22 @@ def query_schedule(self, mod: IRModule, target: Target) -> Optional[Schedule]: The IRModule to be searched for. target : Target The target to be searched for. + workload_name : str + The name of the workload to be searched for. Returns ------- schedule : Optional[Schedule] The best schedule of the given workload; None if not found. """ - return _ffi_api.DatabaseQuerySchedule(self, mod, target) # type: ignore # pylint: disable=no-member + return _ffi_api.DatabaseQuerySchedule(self, mod, target, workload_name) # type: ignore # pylint: disable=no-member - def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]: + def query_ir_module( + self, + mod: IRModule, + target: Target, + workload_name: str, + ) -> Optional[IRModule]: """Query the best IRModule of the given workload from the database. Parameters @@ -278,18 +297,22 @@ def query_ir_module(self, mod: IRModule, target: Target) -> Optional[IRModule]: The IRModule to be searched for. target : Target The target to be searched for. + workload_name : str + The name of the workload to be searched for. Returns ------- ir_module : Optional[IRModule] The best IRModule of the given workload; None if not found. 
""" - return _ffi_api.DatabaseQueryIRModule(self, mod, target) # type: ignore # pylint: disable=no-member + return _ffi_api.DatabaseQueryIRModule(self, mod, target, workload_name) # type: ignore # pylint: disable=no-member def query( self, mod: IRModule, target: Target, + *, + workload_name: str = "main", kind: Union[ Literal["schedule"], Literal["record"], @@ -313,11 +336,11 @@ def query( The best optimization outcome of the given workload. """ if kind == "schedule": - return self.query_schedule(mod, target) + return self.query_schedule(mod, target, workload_name) if kind == "record": - return self.query_tuning_record(mod, target) + return self.query_tuning_record(mod, target, workload_name) if kind == "ir_module": - return self.query_ir_module(mod, target) + return self.query_ir_module(mod, target, workload_name) raise ValueError(f'Unknown kind: {kind}. Candidates are: "schedule", "record", "ir_module"') def __enter__(self) -> "Database": diff --git a/python/tvm/meta_schedule/database/schedule_fn_database.py b/python/tvm/meta_schedule/database/schedule_fn_database.py new file mode 100644 index 000000000000..2918f05799dc --- /dev/null +++ b/python/tvm/meta_schedule/database/schedule_fn_database.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""A database for injecting handcrafted schedule functions.""" +from typing import Callable + +from tvm._ffi import register_object +from tvm.tir import Schedule + +from .. import _ffi_api +from .database import Database + + +@register_object("meta_schedule.ScheduleFnDatabase") +class ScheduleFnDatabase(Database): + """A database for injecting handcrafted schedule functions.""" + + def __init__( + self, + schedule_fn: Callable[[Schedule], bool], + ) -> None: + self.__init_handle_by_constructor__( + _ffi_api.DatabaseScheduleFnDatabase, # type: ignore # pylint: disable=no-member + schedule_fn, + ) diff --git a/python/tvm/meta_schedule/testing/utils.py b/python/tvm/meta_schedule/testing/utils.py deleted file mode 100644 index 5919fb47c809..000000000000 --- a/python/tvm/meta_schedule/testing/utils.py +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Testing utility functions in meta schedule""" -from typing import Callable, Dict, Optional, Union - -from tvm import meta_schedule as ms -from tvm.ir import IRModule, transform -from tvm.relay import Function as RelayFunc -from tvm.runtime import NDArray -from tvm.target import Target -from tvm.tir import Schedule - - -def apply_fixed_schedules( - relay_mod: Union[RelayFunc, IRModule], - target: Union[str, Target], - params: Optional[Dict[str, NDArray]], - schedule_fn: Callable[[ms.ExtractedTask, Schedule], bool], - tir_converter: str = "default", -): - """Apply fixed schedules (manually written, without any tunable knobs) as specified by - schedule_fn to extracted tasks, and return a database that can be passed to compilation. - - Parameters - ---------- - mod : Union[RelayFunc, IRModule] - The Relay module to apply fixed schedules. - target : Union[str, Target] - The target used to extract tasks. - params : Optional[Dict[str, tvm.runtime.NDArray]] - The associated parameters of the module. - schedule_fn : Callable[[ExtractedTask, Schedule], bool] - A callable that is applied for each extracted task and the corresponding default schedule. - Returns True if the given schedule should be committed to the database, False otherwise. - tir_converter : str - The filter function to filter out the extracted tasks. Builtin filters: - - "default" - - "allow_extern" - The converter is a PackedFunc registered as f"relay.backend.tir_converter.{tir_converter}", - with the signature below: - (args: List[te.Tensor], constants: List[NDArray]) -> Optional[tir.PrimFunc] - - Returns - ------- - database : Database - The database containing dummy tuning records for manually scheduled traces. 
- """ - target = Target(target) if isinstance(target, str) else target - config = {"relay.backend.use_meta_schedule": True} - for k, v in transform.PassContext.current().config.items(): - config[k] = v - - extracted_tasks = ms.extract_task_from_relay( - relay_mod, - target, - params, - tir_converter=tir_converter, - ) - database = ms.database.MemoryDatabase() - for task in extracted_tasks: - mod = ms.default_config.mod(task.dispatched[0]) - sch = Schedule(mod) - - if schedule_fn(task, sch): - workload = database.commit_workload(mod) - tune_rec = ms.database.TuningRecord(sch.trace, workload, [0.0], target, []) - database.commit_tuning_record(tune_rec) - - return database diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index fedd2aa35278..d082ff7a3901 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -156,7 +156,8 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w /******** Database ********/ -Optional DatabaseNode::QueryTuningRecord(IRModule mod, Target target) { +Optional DatabaseNode::QueryTuningRecord(const IRModule& mod, const Target& target, + const String& workload_name) { if (!this->HasWorkload(mod)) { return NullOpt; } @@ -168,8 +169,9 @@ Optional DatabaseNode::QueryTuningRecord(IRModule mod, Target targ return records[0]; } -Optional DatabaseNode::QuerySchedule(IRModule mod, Target target) { - if (Optional opt_record = this->QueryTuningRecord(mod, target)) { +Optional DatabaseNode::QuerySchedule(const IRModule& mod, const Target& target, + const String& workload_name) { + if (Optional opt_record = this->QueryTuningRecord(mod, target, workload_name)) { TuningRecord record = opt_record.value(); tir::Schedule sch = tir::Schedule::Traced(record->workload->mod, /*seed=*/-1, /*debug_mask=*/0, @@ -181,8 +183,9 @@ Optional DatabaseNode::QuerySchedule(IRModule mod, Target target) } } -Optional DatabaseNode::QueryIRModule(IRModule mod, 
Target target) { - if (Optional opt_sch = this->QuerySchedule(mod, target)) { +Optional DatabaseNode::QueryIRModule(const IRModule& mod, const Target& target, + const String& workload_name) { + if (Optional opt_sch = this->QuerySchedule(mod, target, workload_name)) { return opt_sch.value()->mod(); } else { return NullOpt; diff --git a/src/meta_schedule/database/memory_database.cc b/src/meta_schedule/database/memory_database.cc index a00d5501ad1d..b6c635555152 100644 --- a/src/meta_schedule/database/memory_database.cc +++ b/src/meta_schedule/database/memory_database.cc @@ -44,7 +44,7 @@ class MemoryDatabaseNode : public DatabaseNode { return false; } - Workload CommitWorkload(const IRModule& mod) { + Workload CommitWorkload(const IRModule& mod) final { for (const auto& workload : workloads) { if (StructuralEqual()(workload->mod, mod)) { return workload; @@ -55,9 +55,9 @@ class MemoryDatabaseNode : public DatabaseNode { return workload; } - void CommitTuningRecord(const TuningRecord& record) { records.push_back(record); } + void CommitTuningRecord(const TuningRecord& record) final { records.push_back(record); } - Array GetTopK(const Workload& workload, int top_k) { + Array GetTopK(const Workload& workload, int top_k) final { std::vector> results; results.reserve(this->records.size()); for (const TuningRecord& record : records) { @@ -91,9 +91,9 @@ class MemoryDatabaseNode : public DatabaseNode { return ret; } - Array GetAllTuningRecords() { return records; } + Array GetAllTuningRecords() final { return records; } - int64_t Size() { return records.size(); } + int64_t Size() final { return records.size(); } }; Database Database::MemoryDatabase() { diff --git a/src/meta_schedule/database/schedule_fn_database.cc b/src/meta_schedule/database/schedule_fn_database.cc new file mode 100644 index 000000000000..751721fe52d4 --- /dev/null +++ b/src/meta_schedule/database/schedule_fn_database.cc @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + 
* or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +class ScheduleFnDatabaseNode : public DatabaseNode { + public: + runtime::TypedPackedFunc schedule_fn; + + void VisitAttrs(AttrVisitor* v) { + // `schedule_fn` is not visited. + } + + static constexpr const char* _type_key = "meta_schedule.ScheduleFnDatabase"; + TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleFnDatabaseNode, DatabaseNode); + + public: + Optional QueryTuningRecord(const IRModule& mod, const Target& target, + const String& workload_name) final { + if (Optional sch = this->QuerySchedule(mod, target, workload_name)) { + return TuningRecord(sch.value()->trace().value(), + /*workload=*/Workload(mod, 0), // + /*run_secs=*/NullOpt, // + /*target=*/target, // + /*arg_info=*/NullOpt); + } + return NullOpt; + } + + Optional QuerySchedule(const IRModule& mod, const Target& target, + const String& workload_name) final { + tir::Schedule sch = + tir::Schedule::Traced(WithAttr(mod, "task_name", workload_name), + /*rand_state=*/-1, + /*debug_mode=*/0, + /*error_render_level=*/tir::ScheduleErrorRenderLevel::kDetail); + if (!schedule_fn(sch)) { + return NullOpt; + } + return sch; + } + + bool HasWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: 
ScheduleFnDatabase.HasWorkload"; + throw; + } + + Workload CommitWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.CommitWorkload"; + throw; + } + + void CommitTuningRecord(const TuningRecord& record) final { + LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.CommitTuningRecord"; + throw; + } + + Array GetTopK(const Workload& workload, int top_k) final { + LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.GetTopK"; + throw; + } + + Array GetAllTuningRecords() final { + LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.GetAllTuningRecords"; + throw; + } + + int64_t Size() final { + LOG(FATAL) << "NotImplementedError: ScheduleFnDatabase.size"; + throw; + } +}; + +Database Database::ScheduleFnDatabase(runtime::TypedPackedFunc schedule_fn) { + ObjectPtr n = make_object(); + n->schedule_fn = std::move(schedule_fn); + return Database(n); +} + +TVM_REGISTER_NODE_TYPE(ScheduleFnDatabaseNode); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseScheduleFnDatabase") + .set_body_typed(Database::ScheduleFnDatabase); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 0e2a3e270257..1d7566ebe2bd 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -367,7 +367,8 @@ class ScheduleBuilder : public ExprVisitor { if (Optional f = tir_converter(te_args, constants)) { if (Optional opt_record = database_.value()->QueryTuningRecord( /*mod=*/backend::PrimFuncToIRModule(f.value()), - /*target=*/target_)) { + /*target=*/target_, + /*workload_name=*/prim_fn_var->name_hint)) { static InstructionKind kind_transform_layout = InstructionKind::Get("TransformLayout"); TuningRecord record = opt_record.value(); for (const Instruction& inst : record->trace->insts) { @@ -383,6 +384,8 @@ class ScheduleBuilder : public ExprVisitor { ICHECK_EQ(mod->functions.size(), 1); mod = 
tir::transform::RemoveWeightLayoutRewriteBlock()(std::move(mod)); prim_func = Downcast(mod->Lookup("main")); + } else { + LOG(WARNING) << "Cannot find workload: " << prim_fn_var->name_hint; } } } diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index c741ecb59ae0..b14c18e55f4b 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -29,7 +29,6 @@ from tvm import meta_schedule as ms from tvm import relay from tvm.contrib import utils -from tvm.meta_schedule.testing.utils import apply_fixed_schedules from tvm.relay.backend import Executor, Runtime INPUT_SHAPE = (1, 3, 16, 16) @@ -407,21 +406,21 @@ def schedule_dense(sch): target = "llvm" params = {"weight": weight_np} - def schedule_fn(task, sch): - if "nn_dense" in task.task_name: + def schedule_fn(sch): + if "nn_dense" in sch.mod.attrs["task_name"]: schedule_dense(sch) return True return False link_params = True - with tvm.transform.PassContext(config={"relay.FuseOps.link_params": link_params}): - database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) - with StringIO() as stderr_buf, redirect_stderr(stderr_buf): - with database, tvm.transform.PassContext( + with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext( opt_level=3, - config={"relay.backend.use_meta_schedule": True}, + config={ + "relay.backend.use_meta_schedule": True, + "relay.FuseOps.link_params": link_params, + }, ): executor = Executor("graph", {"link-params": link_params}) lib = relay.build(relay_mod, target=target, executor=executor) diff --git a/tests/python/unittest/test_meta_schedule_multi_anchor.py b/tests/python/unittest/test_meta_schedule_multi_anchor.py index 177001781179..cb6f59c6e5d5 100644 --- a/tests/python/unittest/test_meta_schedule_multi_anchor.py +++ b/tests/python/unittest/test_meta_schedule_multi_anchor.py @@ -19,7 +19,6 @@ import tvm.testing from tvm import meta_schedule as ms from tvm import 
relay -from tvm.meta_schedule.testing.utils import apply_fixed_schedules def get_dense_dense(data_shape, weight_shape): @@ -63,14 +62,13 @@ def test_dense_dense(): target = "llvm" params = {"weight1": weight1_np, "weight2": weight2_np} - def schedule_fn(task, sch): - if "nn_dense_nn_dense" in task.task_name: + def schedule_fn(sch): + if "nn_dense_nn_dense" in sch.mod.attrs["task_name"]: schedule_dense_dense(sch) return True return False - database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) - with database: + with ms.database.ScheduleFnDatabase(schedule_fn): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_meta_schedule": True}, diff --git a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py index 939851a65731..b37333803603 100644 --- a/tests/python/unittest/test_meta_schedule_relay_tir_compute.py +++ b/tests/python/unittest/test_meta_schedule_relay_tir_compute.py @@ -18,8 +18,9 @@ import tvm import tvm.testing import tvm.topi.testing -from tvm import autotvm, relay, te -from tvm.meta_schedule.testing.utils import apply_fixed_schedules +from tvm import autotvm +from tvm import meta_schedule as ms +from tvm import relay, te from tvm.relay.testing.temp_op_attr import TempOpAttr from tvm.script import tir as T @@ -139,21 +140,14 @@ def test_conv2d(): target = "llvm" params = {"weight": weight_np} - def schedule_fn(task, sch): - if "nn_conv2d" in task.task_name: + def schedule_fn(sch): + if "nn_conv2d" in sch.mod.attrs["task_name"]: schedule_tir_conv2d_nchw_oihw(sch) return True return False with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy): - database = apply_fixed_schedules( - relay_mod, - target, - params, - schedule_fn, - tir_converter="allow_extern", - ) - with database, tvm.transform.PassContext( + with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext( opt_level=3, config={ "relay.backend.use_meta_schedule": True, 
diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index bc37fed7d691..b05b57feaf4c 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -29,7 +29,6 @@ from tvm.contrib import graph_executor from tvm.ir import IRModule from tvm.meta_schedule.testing.relay_workload import get_network -from tvm.meta_schedule.testing.utils import apply_fixed_schedules from tvm.script import tir as T from tvm.target.target import Target from tvm.tir.schedule import BlockRV, Schedule @@ -452,8 +451,8 @@ def manual_tir_common(do_tune=False): ) else: - def schedule_fn(task, sch): - if "dense" not in task.task_name: + def schedule_fn(sch) -> bool: + if "dense" not in sch.mod.attrs["task_name"]: return False block = sch.get_block("compute") @@ -468,7 +467,7 @@ def schedule_fn(task, sch): return True - database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) + database = ms.database.ScheduleFnDatabase(schedule_fn) with database, tvm.transform.PassContext( opt_level=3, From 3d41ac3a9ab58ba5f7d3182e6afe915924568f8d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 29 Aug 2022 02:29:24 -0700 Subject: [PATCH 064/704] [Refactor] Replace std::tie with structured bindings (#12610) * [Refactor] Replace std::tie with structured bindings With C++17 enabled in https://github.com/apache/tvm/pull/12337, using structured bindings to replace cases where `std::tie` is used to define local variables. * Added missing header for * Silenced unused variable warnings after structured bindings This is a bug in gcc version 7, resolved in gcc 8. While gcc version 7 is used for CI, we'll need to silence unused variable warnings resulting from using only part of a structured binding. 
More information: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 --- src/auto_scheduler/auto_schedule.cc | 4 +- src/auto_scheduler/compute_dag.cc | 17 +++---- src/auto_scheduler/feature.cc | 9 +--- .../search_policy/search_policy.cc | 4 +- .../search_policy/sketch_policy_rules.cc | 3 +- src/ir/instrument.cc | 5 +- src/meta_schedule/database/json_database.cc | 4 +- .../mutator/mutate_compute_location.cc | 4 +- .../schedule_rule/cross_thread_reduction.cc | 6 +-- .../space_generator/post_order_apply.cc | 4 +- src/relay/collage/partition_rule.cc | 12 ++--- src/relay/collage/sub_graph.cc | 8 +-- src/relay/qnn/op/convolution.cc | 4 +- src/relay/qnn/op/leaky_relu.cc | 6 +-- src/relay/qnn/op/requantize.cc | 3 +- src/relay/qnn/utils.cc | 6 +-- src/relay/quantize/realize.cc | 6 +-- .../transforms/combine_parallel_conv2d.cc | 4 +- .../transforms/combine_parallel_dense.cc | 4 +- src/runtime/graph_executor/graph_executor.cc | 4 +- src/target/source/ptx.cc | 12 ++--- src/te/autodiff/ad_simplify.cc | 10 ++-- src/te/autodiff/ad_utils.cc | 8 +-- src/te/autodiff/jacobian.cc | 4 +- src/tir/schedule/analysis/analysis.cc | 11 ++-- src/tir/schedule/primitive/block_annotate.cc | 4 +- .../primitive/layout_transformation.cc | 12 ++--- .../schedule/primitive/loop_transformation.cc | 4 +- src/tir/schedule/primitive/reduction.cc | 12 ++--- src/tir/schedule/primitive/sampling.cc | 4 +- src/tir/transforms/loop_partition.cc | 51 ++++++++++++------- .../lower_cross_thread_reduction.cc | 15 ++---- src/tir/transforms/lower_thread_allreduce.cc | 5 +- src/tir/transforms/lower_warp_memory.cc | 11 ++-- .../manifest_shared_memory_local_stage.cc | 10 +--- 35 files changed, 105 insertions(+), 185 deletions(-) diff --git a/src/auto_scheduler/auto_schedule.cc b/src/auto_scheduler/auto_schedule.cc index 747aa01cfa05..41aa49c77193 100755 --- a/src/auto_scheduler/auto_schedule.cc +++ b/src/auto_scheduler/auto_schedule.cc @@ -78,9 +78,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.TuningOptions") 
TVM_REGISTER_GLOBAL("auto_scheduler.AutoSchedule") .set_body_typed([](SearchPolicy search_policy, TuningOptions tuning_options) { - te::Schedule sch; - Array return_tensors; - std::tie(sch, return_tensors) = AutoSchedule(search_policy, tuning_options); + auto [sch, return_tensors] = AutoSchedule(search_policy, tuning_options); return Array{sch, return_tensors}; }); } // namespace auto_scheduler diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc index dad55db0303f..5500707fb9af 100644 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -1325,10 +1325,9 @@ State ComputeDAG::InferBound(const State& state) const { Array stages; StageToAxesMap stage_to_axes; - te::Schedule sch; - Array tensors; // Replay steps to tvm::Schedule - std::tie(sch, tensors) = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes); + auto [sch, tensors] = ApplySteps(pstate->transform_steps, &stages, &stage_to_axes); + (void)tensors; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 sch = sch.normalize_for_feature_extraction(); // Get bound information from TVM schedule Map bounds = te::InferBound(sch); @@ -1382,9 +1381,8 @@ Array ComputeDAG::InferBound(const Array& states) const { } ComputeDAG ComputeDAG::ReplayAndGetDAG(const Array& transform_steps) const { - te::Schedule sch; - Array old_tensors; - std::tie(sch, old_tensors) = ApplySteps(transform_steps); + auto [sch, old_tensors] = ApplySteps(transform_steps); + (void)old_tensors; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 return ComputeDAG(sch); } @@ -1481,11 +1479,8 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAG") TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGApplyStepsFromState") .set_body_typed([](const ComputeDAG& dag, const State& state, int layout_rewrite) { - te::Schedule sch; - Array return_tensors; - std::tie(sch, return_tensors) = - dag.ApplySteps(state->transform_steps, nullptr, nullptr, - static_cast(layout_rewrite)); + auto [sch, 
return_tensors] = dag.ApplySteps(state->transform_steps, nullptr, nullptr, + static_cast(layout_rewrite)); return Array{sch, return_tensors}; }); diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index c930bf0c4e73..e079018151a7 100644 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -952,9 +952,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { unique_lines = std::max(unique_lines, 1.0f); } - ReuseType reuse_type; - float reuse_dis_iter, reuse_dis_bytes, reuse_ct; - std::tie(reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct) = + auto [reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct] = ComputeReuse(t, acc.indices, for_loop_stack_, for_touch_regions_, ana_); acc_feas.emplace_back(); @@ -1356,10 +1354,7 @@ void GetPerStoreFeatureName(int max_n_bufs, std::vector* ret) { void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, int max_n_bufs, std::vector* feature, std::atomic* error_ct) { - te::Schedule sch; - Array tensors; - - std::tie(sch, tensors) = task->compute_dag.ApplySteps(state->transform_steps); + auto [sch, tensors] = task->compute_dag.ApplySteps(state->transform_steps); // When inlining, replace const matrices with const values. 
// Produces wrong IR, but good enough for feature extraction, and diff --git a/src/auto_scheduler/search_policy/search_policy.cc b/src/auto_scheduler/search_policy/search_policy.cc index 702eec087668..196bee8ff0e2 100644 --- a/src/auto_scheduler/search_policy/search_policy.cc +++ b/src/auto_scheduler/search_policy/search_policy.cc @@ -106,9 +106,7 @@ TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyRunCallbacks") TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyContinueSearchOneRound") .set_body_typed([](SearchPolicy policy, int num_measure, ProgramMeasurer measurer) { - Array inputs; - Array results; - std::tie(inputs, results) = policy->ContinueSearchOneRound(num_measure, measurer); + auto [inputs, results] = policy->ContinueSearchOneRound(num_measure, measurer); return Array{inputs, results}; }); diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 8df69fc7ce3b..862e593c9dd3 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -343,8 +343,7 @@ SketchGenerationRule::ConditionKind RuleCrossThreadReduction::MeetCondition( const auto& op = state->stages[stage_id]->op; if (op->IsInstance()) { // Compute the product of lengths of all space iters and all reduce iters - int cum_space_len, cum_reduce_len; - std::tie(cum_space_len, cum_reduce_len) = + auto [cum_space_len, cum_reduce_len] = GetCumulativeSpaceAndReductionLength(state->stages[stage_id]); if (NeedsMultilevelTiling(policy.search_task, state, stage_id)) { diff --git a/src/ir/instrument.cc b/src/ir/instrument.cc index 795e5b8cb542..6701308fbfb7 100644 --- a/src/ir/instrument.cc +++ b/src/ir/instrument.cc @@ -288,10 +288,7 @@ String RenderPassProfiles() { os << std::fixed; while (profiles.size() > 0) { - size_t depth; - PassProfile::Duration parent_duration; - PassProfile* profile; - std::tie(depth, parent_duration, profile) = profiles.top(); + auto [depth, 
parent_duration, profile] = profiles.top(); profiles.pop(); // indent depth diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index f8fb64e92407..2e4f85260835 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -115,9 +115,7 @@ class JSONDatabaseNode : public DatabaseNode { Workload CommitWorkload(const IRModule& mod) { // Try to insert `mod` into `workloads_` - decltype(this->workloads2idx_)::iterator it; - bool inserted = false; - std::tie(it, inserted) = + auto [it, inserted] = this->workloads2idx_.emplace(Workload(mod, tvm::StructuralHash()(mod)), -1); Workload workload = it->first; // If `mod` is new in `workloads2idx_`, append it to the workload file diff --git a/src/meta_schedule/mutator/mutate_compute_location.cc b/src/meta_schedule/mutator/mutate_compute_location.cc index 3ed56df1b381..9d6d69ba355f 100644 --- a/src/meta_schedule/mutator/mutate_compute_location.cc +++ b/src/meta_schedule/mutator/mutate_compute_location.cc @@ -86,9 +86,7 @@ std::vector MutateComputeLocationNode::Fin int old_decision = Downcast(decision)->value; // Step 2. Collect all the compute_at locations. - Array location_srefs; - std::vector location_indices; - std::tie(location_srefs, location_indices) = CollectComputeLocation(sch->state(), block_sref); + auto [location_srefs, location_indices] = CollectComputeLocation(sch->state(), block_sref); // Step 3. Remove the old decision. 
auto it = std::find(location_indices.begin(), location_indices.end(), old_decision); if (it != location_indices.end()) { diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc index 242f1aea89c5..0f0ab99e7259 100644 --- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc +++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc @@ -64,15 +64,11 @@ class CrossThreadReductionNode : public ScheduleRuleNode { // Step 2. Check the opportunity for block fusion. We say "fusible", if we can compute-at the // block to its consumers. We want to fuse as much as possible because it results in // significantly faster schedule. - bool fusible = false; // `target_loop` is the loop position where the input block will be computed at. - tir::LoopRV target_loop{nullptr}; // `target_block` is the consumer block that we want to compute-at the input block to. - tir::BlockRV target_block{nullptr}; // `tgt_block_innermost_loop` is the innermost loop outside the target block. - tir::LoopRV tgt_block_innermost_loop{nullptr}; - std::tie(fusible, target_loop, target_block, tgt_block_innermost_loop) = + auto [fusible, target_loop, target_block, tgt_block_innermost_loop] = GetComputeTargetLoopAndBlock(tmp_sch, block_rv); // Step 3. Try block fusion. 
diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc index eab084f8978f..9be89e2d9c70 100644 --- a/src/meta_schedule/space_generator/post_order_apply.cc +++ b/src/meta_schedule/space_generator/post_order_apply.cc @@ -140,9 +140,7 @@ class PostOrderApplyNode : public SpaceGeneratorNode { result.clear(); while (!stack.empty()) { // get the stack.top() - tir::Schedule sch; - Array blocks; - std::tie(sch, blocks) = stack.back(); + auto [sch, blocks] = stack.back(); stack.pop_back(); // if all blocks are visited if (blocks.empty()) { diff --git a/src/relay/collage/partition_rule.cc b/src/relay/collage/partition_rule.cc index e11f740acfe9..1d8c5e9723ee 100644 --- a/src/relay/collage/partition_rule.cc +++ b/src/relay/collage/partition_rule.cc @@ -92,9 +92,7 @@ std::vector DFPatternPartitionRuleNode::AllCandidates( continue; } IndexSet inside = MatcherToIndexSet(matcher); - OpPatternKind kind; - String label; - std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside); SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); String rule_name = rule_name_.empty() ? 
sub_graph->label_ : rule_name_; CandidatePartition candidate(std::move(rule_name), std::move(sub_graph), spec); @@ -256,9 +254,7 @@ std::vector OpCallByKindPartitionRuleNode::AllCandidates( auto node = dataflow_graph.index_to_node(index); Expr sub_expr = node->ref(); if (sub_expr->IsInstance()) { - OpPatternKind kind; - String label; - std::tie(kind, label) = SubExprKindAndLabel(sub_expr); + auto [kind, label] = SubExprKindAndLabel(sub_expr); if (kind <= kOutEWiseFusable) { IndexSet inside(dataflow_graph.size(), {index}); SubGraph sub_graph(dataflow_graph, std::move(inside), kind, std::move(label)); @@ -404,9 +400,7 @@ std::vector HostPartitionRuleNode::AllCandidates( continue; } IndexSet inside(dataflow_graph.size(), {index}); - OpPatternKind kind; - String label; - std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside); SubGraph sub_graph(dataflow_graph, std::move(inside), kind, label); String rule_name = NestLabels(rule_name_, sub_graph->label_); // We'll a zero cost for the candidate since we'll never want to actually estimate the cost diff --git a/src/relay/collage/sub_graph.cc b/src/relay/collage/sub_graph.cc index 63edc8c079fb..dee72093fd2f 100644 --- a/src/relay/collage/sub_graph.cc +++ b/src/relay/collage/sub_graph.cc @@ -439,9 +439,7 @@ std::pair SubGraphKindAndLabel(const DataflowGraph& bool first = true; OpPatternKind max_kind = kElemWise; for (PostDfsIndex index : inside) { - OpPatternKind sub_kind; - std::string sub_label; - std::tie(sub_kind, sub_label) = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref()); + auto [sub_kind, sub_label] = SubExprKindAndLabel(dataflow_graph.index_to_node(index)->ref()); if (!sub_label.empty()) { if (first) { first = false; @@ -995,9 +993,7 @@ transform::Pass PartitionForTesting(Integer max_exits, Bool allow_taps, String c // Build the overall sub-graph, which will include any "Composite" functions as // well as any nodes 
without a label. IndexSet inside(dataflow_graph.size(), node_indexes); - OpPatternKind kind; - String label; - std::tie(kind, label) = SubGraphKindAndLabel(dataflow_graph, inside); + auto [kind, label] = SubGraphKindAndLabel(dataflow_graph, inside); SubGraph sub_graph(dataflow_graph, inside, kind, label, std::move(nested_sub_graphs)); // Push the overall sub-graph into the final "Compiler" function. diff --git a/src/relay/qnn/op/convolution.cc b/src/relay/qnn/op/convolution.cc index 42e4540f0f2c..64a5a02e6e25 100644 --- a/src/relay/qnn/op/convolution.cc +++ b/src/relay/qnn/op/convolution.cc @@ -722,9 +722,9 @@ Expr QnnConv2DCanonicalize(const Attrs& attrs, const Array& new_args, << "qnn.conv2d supports only OIHW/HWIO/HWOI/OHWI kernel data layout."; ICHECK(param->kernel_size.defined()) << "qnn.conv2d requires kernel size to be specified."; - int batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier; - std::tie(batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier) = + auto [batch_size, in_channels, out_channels, kernel_h, kernel_w, channel_multiplier] = GetWorkload(arg_types, param); + (void)batch_size; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 // zero points are allowed to be non-scalar. Let's check if that's the case. 
bool dynamic_zp = false; diff --git a/src/relay/qnn/op/leaky_relu.cc b/src/relay/qnn/op/leaky_relu.cc index 75bfabb7db85..458fde0d8a08 100644 --- a/src/relay/qnn/op/leaky_relu.cc +++ b/src/relay/qnn/op/leaky_relu.cc @@ -125,13 +125,11 @@ Expr QnnLeakyReluCanonicalize(const Attrs& attrs, const Array& new_args, output_zero_point, input_shape); // alpha * Q_i' - int32_t fixed_point_multiplier, shift; - std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(alpha); + auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(alpha); auto prod = FixedPointMultiply(requantized_expr, fixed_point_multiplier, shift); // (1 - alpha) * zp_o - int32_t fixed_point_multiplier_z, shift_z; - std::tie(fixed_point_multiplier_z, shift_z) = GetFixedPointMultiplierShift(1 - alpha); + auto [fixed_point_multiplier_z, shift_z] = GetFixedPointMultiplierShift(1 - alpha); auto scaled_z = FixedPointMultiply(output_zero_point, fixed_point_multiplier_z, shift_z); // alpha * Q_i' + (1 - alpha) * zp_o diff --git a/src/relay/qnn/op/requantize.cc b/src/relay/qnn/op/requantize.cc index 5bf53a95edda..ae321b459788 100644 --- a/src/relay/qnn/op/requantize.cc +++ b/src/relay/qnn/op/requantize.cc @@ -223,8 +223,7 @@ Expr RequantizeLowerInt(const Expr& input_tensor, const Expr& input_scale, static_cast(input_scale_float) / static_cast(output_scale_float); // Skip if input and output scales are same. 
if (!IsEqualScalar(input_scale, output_scale)) { - int32_t fixed_point_multiplier, shift; - std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(double_multiplier); + auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(double_multiplier); const bool is_upward_rounding = (param->rounding == "UPWARD"); diff --git a/src/relay/qnn/utils.cc b/src/relay/qnn/utils.cc index 7dfd788d96c6..ed7a415cf6af 100644 --- a/src/relay/qnn/utils.cc +++ b/src/relay/qnn/utils.cc @@ -64,8 +64,7 @@ Expr FixedPointMultiplyToNearest(Expr tensor, double multiplier, tensor = Cast(tensor, hp_dtype); // 1) Calculating the integer multiplier and integer shift - int32_t fixed_point_multiplier, shift; - std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(multiplier); + auto [fixed_point_multiplier, shift] = GetFixedPointMultiplierShift(multiplier); int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; @@ -128,8 +127,7 @@ Expr FixedPointMultiplyPerChannel(Expr tensor, std::vector multipliers, std::vector fixed_pt_multipliers, lshifts, rshifts; bool is_lshift_required = false; for (auto multiplier : multipliers) { - int32_t fixed_pt_multiplier, shift; - std::tie(fixed_pt_multiplier, shift) = GetFixedPointMultiplierShift(multiplier); + auto [fixed_pt_multiplier, shift] = GetFixedPointMultiplierShift(multiplier); int lshift = shift > 0 ? shift : 0; int rshift = shift > 0 ? 
0 : -shift; fixed_pt_multipliers.push_back(fixed_pt_multiplier); diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 5766c62eaa43..720ef25cd33d 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -77,8 +77,7 @@ inline Expr MulAndDiv(Expr data, float s1, float s2, DataType dtype, return Multiply(data, MakeConstantScalar(dtype, factor)); } else { if (cfg->rounding == "UPWARD") { - int32_t fixed_point_multiplier, shift; - std::tie(fixed_point_multiplier, shift) = qnn::GetFixedPointMultiplierShift(factor); + auto [fixed_point_multiplier, shift] = qnn::GetFixedPointMultiplierShift(factor); data = relay::FixedPointMultiply(data, fixed_point_multiplier, shift); } else { data = qnn::FixedPointMultiplyToNearest(data, factor, data_shape); @@ -135,8 +134,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array& new_args, const Ob } else { data = Cast(data, DataType::Int(64)); if (cfg->rounding == "UPWARD") { - int32_t fixed_point_multiplier, shift; - std::tie(fixed_point_multiplier, shift) = + auto [fixed_point_multiplier, shift] = qnn::GetFixedPointMultiplierShift(idom_scale_imm / odom_scale_imm); data = relay::FixedPointMultiply(data, fixed_point_multiplier, shift); } else { diff --git a/src/relay/transforms/combine_parallel_conv2d.cc b/src/relay/transforms/combine_parallel_conv2d.cc index 20b206e0423c..9c7bcc27ec82 100644 --- a/src/relay/transforms/combine_parallel_conv2d.cc +++ b/src/relay/transforms/combine_parallel_conv2d.cc @@ -83,9 +83,7 @@ class ParallelConv2DCombiner : public ParallelOpCombiner { Call MakeCombinedOp(const Group& branches) { const Op& conv2d = Op::Get("nn.conv2d"); Expr data = branches[0][0]->args[0]; - Expr new_weight; - IndexExpr new_channels; - std::tie(new_weight, new_channels) = TransformWeight(branches); + auto [new_weight, new_channels] = TransformWeight(branches); const CallNode* group_root = branches[0][0]; const auto* attrs = group_root->attrs.as(); diff --git 
a/src/relay/transforms/combine_parallel_dense.cc b/src/relay/transforms/combine_parallel_dense.cc index d5404ba30f90..7cf102b5bcab 100644 --- a/src/relay/transforms/combine_parallel_dense.cc +++ b/src/relay/transforms/combine_parallel_dense.cc @@ -116,10 +116,8 @@ class ParallelDenseToDenseCombiner : public ParallelOpCombiner { Call MakeCombinedOp(const Group& branches) { const Op& dense_op = Op::Get("nn.dense"); Expr input = branches[0][0]->args[0]; - Expr new_weight; - IndexExpr new_output_dims; // concat all weights into one - std::tie(new_weight, new_output_dims) = TransformWeight(branches); + auto [new_weight, new_output_dims] = TransformWeight(branches); const auto* origin_attrs = branches[0][0]->attrs.as(); ICHECK(origin_attrs); const auto dense_attrs = make_object(); diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index e3113dbfe54c..fc7e82bed4e2 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -674,9 +674,7 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name, }); } else if (name == "get_input_info") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - GraphExecutor::ShapeInfo shape_info; - GraphExecutor::DtypeInfo dtype_info; - std::tie(shape_info, dtype_info) = this->GetInputInfo(); + auto [shape_info, dtype_info] = this->GetInputInfo(); Map input_info; input_info.Set("shape", shape_info); input_info.Set("dtype", dtype_info); diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc index c5e3bf98ec2d..881c425e7742 100644 --- a/src/target/source/ptx.cc +++ b/src/target/source/ptx.cc @@ -403,8 +403,7 @@ class Replacer { } std::string rewrite(std::string str) { for (auto&& rule : _rules) { - std::string pattern, replacement; - std::tie(pattern, replacement) = rule; + auto [pattern, replacement] = rule; size_t len = pattern.size(); size_t new_len = replacement.size(); size_t pos = 
str.find(pattern); @@ -532,8 +531,7 @@ std::string PrintMMAAssembly(const std::string& shape, const std::string& A_layo dtype_c = ptx::DTypeFromString(C_dtype); ptx::LayoutType layout_a = ptx::LayoutTypeFromString(A_layout), layout_b = ptx::LayoutTypeFromString(B_layout); - int m, n, k; - std::tie(m, n, k) = ptx::ParseMMAShape(shape); + auto [m, n, k] = ptx::ParseMMAShape(shape); CheckMMAConfigValidity(m, n, k, layout_a, layout_b, dtype_a, dtype_b, dtype_c, bit_op, sparse, saturate); std::string asm_code = R"( @@ -545,8 +543,7 @@ std::string PrintMMAAssembly(const std::string& shape, const std::string& A_layo : {inputs}); } )"; - std::string templates_str, inputs_str, outputs_str; - std::tie(templates_str, inputs_str, outputs_str) = + auto [templates_str, inputs_str, outputs_str] = GetMMAOperands(m, n, k, dtype_a, dtype_b, dtype_c, sparse); // replace patterns @@ -622,8 +619,7 @@ std::string PrintLoadMatrixAssembly(bool trans, int num, const std::string& type ); } )"; - std::string templates_str, outputs_str; - std::tie(templates_str, outputs_str) = GetLoadMatrixOperands(num, local_ptr, local_elem_offset); + auto [templates_str, outputs_str] = GetLoadMatrixOperands(num, local_ptr, local_elem_offset); Replacer replacer; replacer.register_rule("{.shape}", ".m8n8"); diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc index 28f57c77da70..26047e879e9b 100644 --- a/src/te/autodiff/ad_simplify.cc +++ b/src/te/autodiff/ad_simplify.cc @@ -1183,21 +1183,19 @@ PrimExpr RemoveJacobianAndLiftNonzeroCondImpl(const PrimExpr& expr_orig, const A return RemoveJacobianAndLiftNonzeroCondImpl(new_red, axis, vranges); } - PrimExpr new_outer_cond, new_reduce_cond; Array new_source = red->source; // Partially lift conditions from the reduce condition - std::tie(new_outer_cond, new_reduce_cond) = + auto [new_outer_cond, new_reduce_cond] = LiftConditionsThroughReduction(red->condition, red->axis, axis); // If it's not sum then we haven't yet lifted nonzeroness cond 
from the source if (!is_sum) { - PrimExpr outer_nz_cond, nz_cond, nz_source; auto nz = NonzeronessCondition(red->source[red->value_index]); // Append conditions from the reduction - nz_cond = new_reduce_cond && nz.cond; - nz_source = nz.value; - std::tie(outer_nz_cond, nz_cond) = LiftConditionsThroughReduction(nz_cond, red->axis, axis); + PrimExpr nz_source = nz.value; + auto [outer_nz_cond, nz_cond] = + LiftConditionsThroughReduction(new_reduce_cond && nz.cond, red->axis, axis); new_outer_cond = new_outer_cond && outer_nz_cond; new_source.Set(red->value_index, Select(nz_cond, nz_source, make_zero(nz_source.dtype()))); } diff --git a/src/te/autodiff/ad_utils.cc b/src/te/autodiff/ad_utils.cc index 268abab9cacb..0d1e4927cdfe 100644 --- a/src/te/autodiff/ad_utils.cc +++ b/src/te/autodiff/ad_utils.cc @@ -47,9 +47,7 @@ std::pair, Map> CloneIterVars(const Array PrimExpr CloneReduction(const PrimExpr& expr) { if (const ReduceNode* red = expr.as()) { - Array new_axis; - Map vmap; - std::tie(new_axis, vmap) = CloneIterVars(red->axis); + auto [new_axis, vmap] = CloneIterVars(red->axis); Array src_with_newaxis; for (const auto& src : red->source) { @@ -71,9 +69,7 @@ Operation ComputeOpFromExprs(const Array& exprs, const Array& const std::string& name, const std::string& tag, const Map& attrs, bool clone_axis) { if (clone_axis) { - Array new_axis = axis; - Map vmap; - std::tie(new_axis, vmap) = CloneIterVars(axis); + auto [new_axis, vmap] = CloneIterVars(axis); Array new_exprs; for (const PrimExpr& e : exprs) { new_exprs.push_back(Substitute(CloneReduction(e), vmap)); diff --git a/src/te/autodiff/jacobian.cc b/src/te/autodiff/jacobian.cc index 7104424957af..e61a590c409d 100644 --- a/src/te/autodiff/jacobian.cc +++ b/src/te/autodiff/jacobian.cc @@ -317,9 +317,7 @@ Tensor Jacobian(const Tensor& output, const Tensor& input) { // We have to clone the iteration axes because otherwise the original expression // cannot be used together with the derivative (it will lead to errors 
during lowering) - Array new_axis; - Map vmap; - std::tie(new_axis, vmap) = te::CloneIterVars(op->axis); + auto [new_axis, vmap] = te::CloneIterVars(op->axis); Array input_indices; size_t i = 0; diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index b9e99257f37c..fb09a3480a3a 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -558,9 +558,13 @@ bool IsWriteCache(const StmtSRef& block_sref) { } const BufferRegion& write_region = block->writes[0]; for (const BufferRegion& read_region : block->reads) { - bool exists, surjective, injective, ordered, no_const_read, no_shift_read; - std::tie(exists, surjective, injective, ordered, no_const_read, no_shift_read) = + auto [exists, surjective, injective, ordered, no_const_read, no_shift_read] = AnalyzeReadWritePattern(read_region, write_region); + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 + (void)exists; + (void)surjective; + (void)no_const_read; + (void)no_shift_read; if (!(injective && ordered)) { return false; } @@ -2118,8 +2122,7 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self, // } // Cond 6. Can successfully calculating the cumulative loop length. 
- int64_t cum_space_len, cum_reduce_len; - std::tie(cum_space_len, cum_reduce_len) = GetCumulativeSpaceAndReductionLength(self, block_sref); + auto [cum_space_len, cum_reduce_len] = GetCumulativeSpaceAndReductionLength(self, block_sref); if (cum_space_len == -1 || cum_reduce_len == -1) { return false; } diff --git a/src/tir/schedule/primitive/block_annotate.cc b/src/tir/schedule/primitive/block_annotate.cc index 31c938313fed..0912e36836e3 100644 --- a/src/tir/schedule/primitive/block_annotate.cc +++ b/src/tir/schedule/primitive/block_annotate.cc @@ -82,9 +82,7 @@ class NonAllocatedBufferError : public ScheduleError { static StmtSRef CheckAndGetBufferAllocationSite(const IRModule& mod, const StmtSRef& block_sref, const Buffer& buffer) { - Optional defining_site_sref; - bool is_alloc; - std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, buffer); + auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, buffer); if (!defining_site_sref.defined() || !is_alloc) { throw NonAllocatedBufferError(mod, buffer); } diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index b4e40fa120fe..8e2643db0103 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -137,9 +137,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_ const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer old_buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, buffer_index_type); - Optional defining_site_sref; - bool is_alloc; - std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, old_buffer); + auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, old_buffer); if (defining_site_sref.defined() && !is_alloc) { throw BufferIsSubregionError(self->mod, old_buffer); } @@ -155,9 +153,7 @@ void TransformLayout(ScheduleState self, const StmtSRef& 
block_sref, int buffer_ Buffer new_buffer{new_buffer_node}; // Step 2: Rewrite access indices and regions of the buffer - Stmt new_stmt; - Map block_sref_reuse; - std::tie(new_stmt, block_sref_reuse) = TransformLayoutRewriter::Rewrite( + auto [new_stmt, block_sref_reuse] = TransformLayoutRewriter::Rewrite( GetRef(scope_block), old_buffer, new_buffer, index_map); Block new_scope_block = Downcast(new_stmt); @@ -492,9 +488,7 @@ void SetAxisSeparator(ScheduleState self, const StmtSRef& block_sref, int buffer const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer old_buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, buffer_index_type); - Optional defining_site_sref; - bool is_alloc; - std::tie(defining_site_sref, is_alloc) = GetBufferDefiningSite(block_sref, old_buffer); + auto [defining_site_sref, is_alloc] = GetBufferDefiningSite(block_sref, old_buffer); if (defining_site_sref.defined() && !is_alloc) { throw BufferIsSubregionError(self->mod, old_buffer); } diff --git a/src/tir/schedule/primitive/loop_transformation.cc b/src/tir/schedule/primitive/loop_transformation.cc index 2db3eb902aba..992817e87e2d 100644 --- a/src/tir/schedule/primitive/loop_transformation.cc +++ b/src/tir/schedule/primitive/loop_transformation.cc @@ -704,9 +704,7 @@ void Reorder(ScheduleState self, const Array& ordered_loop_srefs) { // the input array // - the bottom of the reorder range is the last loop in the input array which is not visited in // the previous traversals - const StmtSRefNode* top = nullptr; - const StmtSRefNode* bottom = nullptr; - std::tie(top, bottom) = GetBoundaryOfReorderRange(self, loop_srefs); + auto [top, bottom] = GetBoundaryOfReorderRange(self, loop_srefs); // Step 3. Collect all loops in the chain and check the loops are single-branch std::vector chain = GetLoopsInReorderRange(self, top, bottom); // Step 4. 
Check the block below has all its block_var to be data-parallel or reduction, diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index 7a4ace736e48..1198e67d710a 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -278,9 +278,7 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref, body = Substitute(body, loop_var_map); // Step 6. Mutate IR const BlockNode* old_scope_root = TVM_SREF_TO_BLOCK(scope_root_sref); - Block new_scope_root{nullptr}; - Block new_reduction_block{nullptr}; - std::tie(new_scope_root, new_reduction_block) = DecomposeReductionBlockReplacer::Replace( + auto [new_scope_root, new_reduction_block] = DecomposeReductionBlockReplacer::Replace( GetRef(old_scope_root), GetRef(loop), body, GetRef(block)); self->Replace(scope_root_sref, new_scope_root, {{GetRef(old_scope_root), new_scope_root}, @@ -1042,12 +1040,8 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax // commutative reducer, combiner lhs and combiner rhs from the reduction identity and the // reduction combiner. The lhs will be used when constructing the write-back block, and the rhs // will be used when constructing the rfactor block. - BufferStore init; - BufferStore update; - CommReducer reducer; - PrimExpr combiner_lhs, combiner_rhs; - std::tie(init, update) = GetBufferStoresFromReductionBlock(self, block); - std::tie(reducer, combiner_lhs, combiner_rhs) = + auto [init, update] = GetBufferStoresFromReductionBlock(self, block); + auto [reducer, combiner_lhs, combiner_rhs] = GetReducerAndCombinerLhsRhs(self, init->value, update); // Step 6. 
Check whether `factor_axis` is in a correct range, and convert it to non-negative if it diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc index 52b5add2bc9e..b1001a7f9455 100644 --- a/src/tir/schedule/primitive/sampling.cc +++ b/src/tir/schedule/primitive/sampling.cc @@ -348,9 +348,7 @@ tir::StmtSRef SampleComputeLocation(tir::ScheduleState self, support::LinearCongruentialEngine::TRandState* rand_state, const StmtSRef& block_sref, Optional* decision) { // Step 1. Collect all possible compute-at locations. - Array location_srefs; - std::vector location_indices; - std::tie(location_srefs, location_indices) = CollectComputeLocation(self, block_sref); + auto [location_srefs, location_indices] = CollectComputeLocation(self, block_sref); ICHECK_EQ(location_srefs.size(), location_indices.size()); // Step 2. If there was a previous decision, keep the decision unchanged if it exists in the diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index 677506889e57..6ecc6459b904 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -553,25 +554,39 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim if (finder.partitions.empty()) return Stmt(); arith::IntervalSet for_interval(min, max); - bool cond_value; - IntSet middle_interval; - ExpressionSet cond_set; - // find an interval in which all conditions on var are true - std::tie(middle_interval, cond_set) = - GetIntervalAndCondset(finder.partitions, for_interval, true, has_partition_hint_); - if (middle_interval.IsNothing()) { - // if such interval doesn't exist, find an interval in which all - // conditions on var are false - std::tie(middle_interval, cond_set) = - GetIntervalAndCondset(finder.partitions, for_interval, false, has_partition_hint_); - if (middle_interval.IsNothing()) - // we couldn't find an interval 
in which the conditions are provably true or false - // Therefore, we can't partition the loop based on those conds - return Stmt(); - cond_value = false; - } else { - cond_value = true; + + auto [middle_interval, cond_set, + opt_cond_value] = [&]() -> std::tuple> { + { + // find an interval in which all conditions on var are true + auto [middle_interval, cond_set] = + GetIntervalAndCondset(finder.partitions, for_interval, true, has_partition_hint_); + if (!middle_interval.IsNothing()) { + return {middle_interval, cond_set, true}; + } + } + + { + // if such interval doesn't exist, find an interval in which all + // conditions on var are false + auto [middle_interval, cond_set] = + GetIntervalAndCondset(finder.partitions, for_interval, false, has_partition_hint_); + + if (!middle_interval.IsNothing()) { + return {middle_interval, cond_set, false}; + } + } + + // we couldn't find an interval in which the conditions are + // provably true or false. Therefore, we can't partition the loop + // based on those conds + return {{}, {}, std::nullopt}; + }(); + + if (!opt_cond_value.has_value()) { + return Stmt(); } + bool cond_value = opt_cond_value.value(); IntervalSet middle_interval_i = Downcast(middle_interval); // middle_interval is the subrange of the loop variable range for which a diff --git a/src/tir/transforms/lower_cross_thread_reduction.cc b/src/tir/transforms/lower_cross_thread_reduction.cc index df8bf69e7468..04b025b5f9ae 100644 --- a/src/tir/transforms/lower_cross_thread_reduction.cc +++ b/src/tir/transforms/lower_cross_thread_reduction.cc @@ -497,14 +497,10 @@ class CrossThreadReductionTransformer : public StmtMutator { // both be BufferStores with the same buffer and indices; // Extract the commutative reducer, combiner lhs and combiner rhs from the reduction identity // and the reduction combiner. 
- BufferStore init{nullptr}; - BufferStore update{nullptr}; - CommReducer reducer{nullptr}; - PrimExpr combiner_lhs{nullptr}; - PrimExpr combiner_rhs{nullptr}; - std::tie(init, update) = GetBufferStoresFromReductionBlock(NullOpt, GetRef(block)); - std::tie(reducer, combiner_lhs, combiner_rhs) = + auto [init, update] = GetBufferStoresFromReductionBlock(NullOpt, GetRef(block)); + auto [reducer, combiner_lhs, combiner_rhs] = GetReducerAndCombinerLhsRhs(NullOpt, init->value, update); + (void)combiner_lhs; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 // Condition 5. The block should be the last block under the first reduction-related loop. bool visit = false; @@ -577,10 +573,7 @@ class CrossThreadReductionTransformer : public StmtMutator { ++reduction_id_; // Step 2. Check whether cross-thread reduction can be applied. If no, throw an exception on // which condition the block violates. - int n_bound_reduction_loops = 0; - CommReducer reducer{nullptr}; - PrimExpr combiner_rhs{nullptr}; - std::tie(n_bound_reduction_loops, reducer, combiner_rhs) = + auto [n_bound_reduction_loops, reducer, combiner_rhs] = CheckCanApplyCrossThreadReduction(block, reduction_loops); // Step 3. 
Before doing the cross-thread reduction, in-thread reduction is needed when // - not all the reduction-related loops are bound to thread axes, or diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index 43f7a103db7f..bd6b5185eb4a 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -301,9 +301,8 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // sort according to dim_index std::sort(block_threads.begin(), block_threads.end()); for (auto&& thr_attr : block_threads) { - int dim_index, extent; - bool is_reduce; - std::tie(dim_index, extent, is_reduce) = thr_attr; + auto [dim_index, extent, is_reduce] = thr_attr; + (void)dim_index; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 if (is_reduce) { contiguous_reduce_extent *= extent; } else { diff --git a/src/tir/transforms/lower_warp_memory.cc b/src/tir/transforms/lower_warp_memory.cc index 408cdbd04ec7..e12e2772ab22 100644 --- a/src/tir/transforms/lower_warp_memory.cc +++ b/src/tir/transforms/lower_warp_memory.cc @@ -311,8 +311,8 @@ class WarpAccessRewriter : protected StmtExprMutator { << "Has StorageFlatten (TE-based schedule) or " << "FlattenBuffer (TIR-based schedules) been run?"; - PrimExpr local_index, group; - std::tie(local_index, group) = SplitIndexByGroup(store->indices[0]); + auto [local_index, group] = SplitIndexByGroup(store->indices[0]); + (void)group; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 auto writer = store.CopyOnWrite(); writer->indices = {local_index}; @@ -332,8 +332,7 @@ class WarpAccessRewriter : protected StmtExprMutator { << "Has StorageFlatten (TE-based schedule) or " << "FlattenBuffer (TIR-based schedules) been run?"; - PrimExpr local_index, group; - std::tie(local_index, group) = SplitIndexByGroup(op->indices[0]); + auto [local_index, group] = SplitIndexByGroup(op->indices[0]); // invariance: local index must do not contain warp id 
ICHECK(!UsesVar(local_index, [this](const VarNode* var) { return var == warp_index_.get(); })) << "LowerWarpMemory failed to rewrite load to shuffle for index " << op->indices[0] @@ -357,12 +356,10 @@ class WarpAccessRewriter : protected StmtExprMutator { // in this access pattern. std::pair SplitIndexByGroup(const PrimExpr& index) { if (index.dtype().lanes() != 1) { - PrimExpr local_index, group; - arith::PVar base; ICHECK(arith::ramp(base, 1, index.dtype().lanes()).Match(index)); - std::tie(local_index, group) = SplitIndexByGroup(base.Eval()); + auto [local_index, group] = SplitIndexByGroup(base.Eval()); local_index = Ramp(local_index, make_const(local_index.dtype(), 1), index.dtype().lanes()); return std::make_pair(local_index, group); } diff --git a/src/tir/transforms/manifest_shared_memory_local_stage.cc b/src/tir/transforms/manifest_shared_memory_local_stage.cc index 16c85642d1e5..0f56c8b8b7c9 100644 --- a/src/tir/transforms/manifest_shared_memory_local_stage.cc +++ b/src/tir/transforms/manifest_shared_memory_local_stage.cc @@ -61,9 +61,7 @@ class IntermediateStageRewriter { std::vector relaxed_loops = CollectRelaxedOuterLoops(block, target_buffer); // Step 1: Create buffer for the local stage - Buffer new_buffer{nullptr}; - Array buffer_indices; - std::tie(new_buffer, buffer_indices) = CreateIntermediateBuffer(relaxed_loops, target_buffer); + auto [new_buffer, buffer_indices] = CreateIntermediateBuffer(relaxed_loops, target_buffer); // Step 2: Create the local stage block Stmt local_stage = MakeLocalStage(block, new_buffer, buffer_indices, relaxed_loops, store); @@ -190,12 +188,8 @@ class SharedMemoryLocalStageInserter : public StmtMutator { // The annotated block must be a leaf block (will be checked during rewriting). No need to // visit its body recursively. 
- Buffer target_buffer{nullptr}; - Buffer new_buffer{nullptr}; - Block new_block{nullptr}; - Stmt local_stage{nullptr}; IntermediateStageRewriter rewriter(ancestor_loop_or_blocks_); - std::tie(target_buffer, new_buffer, new_block, local_stage) = rewriter.Rewrite(op); + auto [target_buffer, new_buffer, new_block, local_stage] = rewriter.Rewrite(op); buffer_remap_.Set(target_buffer, new_buffer); new_block.CopyOnWrite()->annotations.erase(attr::manifest_shared_memory_local_stage); From c5c99a4b523c9165adb4d552d284f8666520336f Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Mon, 29 Aug 2022 18:31:00 +0800 Subject: [PATCH 065/704] [QNN] Align output_scale/zero_point of sigmoid to Torch (#12624) * [QNN] Align output_scale/zero_point of sigmoid to Torch * [QNN] Align output_scale/zero_point of sigmoid to Torch --- python/tvm/relay/frontend/pytorch.py | 6 ++-- python/tvm/relay/frontend/qnn_torch.py | 40 ++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 9f808203a6e1..2255396c0633 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -1565,10 +1565,8 @@ def func(x): return _op.tensor.sigmoid(x) if self.is_quantized_tensor(data): - assert len(inputs) == 3, "Input quant param not found in op inputs" - input_scale = _expr.const(inputs[1]) - input_zero_point = _expr.const(inputs[2]) - return qnn_torch.quantized_sigmoid(data, input_scale, input_zero_point) + assert len(inputs) == 5, "Input/Ouput quant param not found in op inputs" + return qnn_torch.quantized_sigmoid(inputs) return func(data) diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index c2e233d5961e..45cb8dedfd53 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -272,6 +272,7 @@ def _get_quant_param_for_input(input_value): "quantized::hardswish": (1, 2), 
"quantized::conv_transpose2d": qconv_indices, "quantized::leaky_relu": (3, 4), + "aten::sigmoid": (1, 2), } def dfs(current_node): @@ -395,6 +396,33 @@ def _add_output_quant_params_to_scalar_op(node, graph, input_scale, input_zero_p node.addInput(out_zero_point_node.output()) +def _add_output_quant_params_to_sigmoid_op(node, graph): + """ + Refer to aten/src/ATen/native/quantized/cpu/qsigmoid.cpp, + the output scale and zp of sigmoid op are two fixed numbers. + So we need to make two new constant nodes in the input IR and + add these params to the inputs of sigmoid op. + """ + # pylint: disable=c-extension-no-member + import torch + + # suppose scale_type is uint8 + out_scale = 1.0 / 256 + out_zero_point = 0 + + # create new constant nodes and add them to graph + out_scale_node = graph.create("prim::Constant") + out_zero_point_node = graph.create("prim::Constant") + out_scale_node.insertBefore(node) + out_zero_point_node.insertBefore(node) + out_scale_node.f_("value", out_scale) + out_zero_point_node.i_("value", out_zero_point) + out_scale_node.output().setType(torch._C.FloatType.get()) + out_zero_point_node.output().setType(torch._C.IntType.get()) + node.addInput(out_scale_node.output()) + node.addInput(out_zero_point_node.output()) + + def add_input_quant_params_to_op_inputs(graph): """ In Torch, input quant params are not explicitly passed around @@ -483,6 +511,9 @@ def add_input_quant_params_to_op_inputs(graph): # see the comments in this function above _add_output_quant_params_to_scalar_op(node, graph, inp_scale, inp_zero_point, scalar) + if operator == "aten::sigmoid": + _add_output_quant_params_to_sigmoid_op(node, graph) + for scale, zp in zip(input_scales, input_zero_points): node.addInput(scale) node.addInput(zp) @@ -571,9 +602,12 @@ def quantized_relu(data, input_zero_point): return _op.tensor.maximum(data, zp) -def quantized_sigmoid(data, input_scale, input_zero_point): - output_scale = input_scale - output_zero_point = input_zero_point +def 
quantized_sigmoid(inputs): + data = inputs[0] + output_scale = _expr.const(inputs[1]) + output_zero_point = _expr.const(inputs[2]) + input_scale = _expr.const(inputs[3]) + input_zero_point = _expr.const(inputs[4]) return relay.qnn.op.sigmoid( data, input_scale, input_zero_point, output_scale, output_zero_point ) From 0de22196db5f818a6937f026db43785935b9e731 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 29 Aug 2022 09:59:10 -0700 Subject: [PATCH 066/704] [microTVM][Zephyr] Disable test_armv7m_intrinsic since it's broken (#12620) add xfail --- tests/micro/zephyr/test_zephyr_armv7m.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py index 1f6a1a1bc13e..6a1dff254591 100644 --- a/tests/micro/zephyr/test_zephyr_armv7m.py +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -104,12 +104,12 @@ def _apply_desired_layout_no_simd(relay_mod): @tvm.testing.requires_micro @pytest.mark.skip_boards(["mps2_an521"]) +@pytest.mark.xfail(reason="due https://github.com/apache/tvm/issues/12619") def test_armv7m_intrinsic(workspace_dir, board, west_cmd, microtvm_debug): """Testing a ARM v7m SIMD extension.""" - if board not in [ "mps2_an521", - "stm32f746xx_disco", + "stm32f746g_disco", "nucleo_f746zg", "nucleo_l4r5zi", "nrf5340dk_nrf5340_cpuapp", From c31a762b985894f64d3a80407b75fadb60240862 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Mon, 29 Aug 2022 11:00:54 -0700 Subject: [PATCH 067/704] [ci] Don't update Jenkinsfile timestamp on image updates (#12621) The timestamp in the Jenkinsfile is there to prevent post-merge conflicts from different PRs that edit the templates merging non-sequentially. This is not an issue when a line is edited in place though, which is often the case when Docker image tags are updated. 
This PR makes it so the timestamp is not updated in these cases which should reduce merge conflicts on these types of PRs. --- ci/jenkins/generate.py | 62 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/ci/jenkins/generate.py b/ci/jenkins/generate.py index 901d413364b3..3ccdedc6d924 100644 --- a/ci/jenkins/generate.py +++ b/ci/jenkins/generate.py @@ -18,11 +18,12 @@ import jinja2 import argparse import difflib -import re import datetime +import re import textwrap from pathlib import Path +from typing import List REPO_ROOT = Path(__file__).resolve().parent.parent.parent @@ -82,9 +83,51 @@ def lines_without_generated_tag(content): ] +def is_changed_images_only(lines: List[str]) -> bool: + """ + Return True if 'line' only edits an image tag or if 'line' is not a changed + line in a diff + """ + added_images = [] + removed_images = [] + diff_lines = [] + + for line in lines[2:]: + if not line.startswith("-") and not line.startswith("+"): + # not a diff line, ignore it + continue + + diff_lines.append(line) + + if len(diff_lines) == 0: + # no changes made + return True + + for line in diff_lines: + is_add = line.startswith("+") + line = line.strip().lstrip("+").lstrip("-") + match = re.search( + r"^(ci_[a-zA-Z0-9]+) = \'.*\'$", + line.strip().lstrip("+").lstrip("-"), + flags=re.MULTILINE, + ) + if match is None: + # matched a non-image line, quit early + return False + + if is_add: + added_images.append(match.groups()[0]) + else: + removed_images.append(match.groups()[0]) + + # make sure that the added image lines match the removed image lines + return len(added_images) > 0 and added_images == removed_images + + if __name__ == "__main__": help = "Regenerate Jenkinsfile from template" parser = argparse.ArgumentParser(description=help) + parser.add_argument("--force", action="store_true", help="always overwrite timestamp") parser.add_argument("--check", action="store_true", help="just verify the output didn't 
change") args = parser.parse_args() @@ -92,6 +135,10 @@ def lines_without_generated_tag(content): content = f.read() data["generated_time"] = datetime.datetime.now().isoformat() + timestamp_match = re.search(r"^// Generated at (.*)$", content, flags=re.MULTILINE) + if not timestamp_match: + raise RuntimeError("Could not find timestamp in Jenkinsfile") + original_timestamp = timestamp_match.groups()[0] environment = jinja2.Environment( loader=jinja2.FileSystemLoader(REPO_ROOT), @@ -103,11 +150,18 @@ def lines_without_generated_tag(content): template = environment.get_template(str(JENKINSFILE_TEMPLATE.relative_to(REPO_ROOT))) new_content = template.render(**data) - diff = "".join( - difflib.unified_diff( + diff = [ + line + for line in difflib.unified_diff( lines_without_generated_tag(content), lines_without_generated_tag(new_content) ) - ) + ] + if not args.force and is_changed_images_only(diff): + new_content = new_content.replace(data["generated_time"], original_timestamp) + print("Detected only Docker-image name changed, skipping timestamp update") + + diff = "".join(diff) + if args.check: if not diff: print("Success, the newly generated Jenkinsfile matched the one on disk") From 74988d36bd578b791bbdcea383d343d62029e9cf Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 29 Aug 2022 14:33:04 -0700 Subject: [PATCH 068/704] [Utils] Handled Callable in tir.schedule._type_checker (#12633) Previously, `Callable` was handled as an atomic type. This worked when it was included as last element of a `Union[]` annotation with no subtypes, but raised an error for other use cases, including `Optional[Callable]`. This commit adds explicit checks for `Callable` type annotations to validate whether the argument is callable, but doesn't recursively validate the signature of the callable object, because lambda functions cannot have type annotations. 
(https://peps.python.org/pep-3107/#lambda) --- python/tvm/tir/schedule/_type_checker.py | 40 ++++++++++ .../unittest/test_type_annotation_checker.py | 77 +++++++++++++++---- 2 files changed, 103 insertions(+), 14 deletions(-) diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py index d45b4fb84b27..0b48dfc2b0e6 100644 --- a/python/tvm/tir/schedule/_type_checker.py +++ b/python/tvm/tir/schedule/_type_checker.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """Type checking functionality""" +import collections +import collections.abc import functools import inspect from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union @@ -26,6 +28,7 @@ def _is_none_type(type_: Any) -> bool: if hasattr(typing, "_GenericAlias"): + # For python versions 3.7 onward, check the __origin__ attribute. class _Subtype: @staticmethod @@ -71,7 +74,15 @@ def union(type_: Any) -> Optional[List[type]]: return list(subtypes) return None + @staticmethod + def callable(type_: Any) -> Optional[List[type]]: + if _Subtype._origin(type_) is collections.abc.Callable: + subtypes = type_.__args__ + return subtypes + return None + elif hasattr(typing, "_Union"): + # For python 3.6 and below, check the __name__ attribute, or CallableMeta. 
class _Subtype: # type: ignore @staticmethod @@ -114,6 +125,13 @@ def union(type_: Any) -> Optional[List[type]]: return list(subtypes) return None + @staticmethod + def callable(type_: Any) -> Optional[List[type]]: + if isinstance(type_, typing.CallableMeta): # type: ignore # pylint: disable=no-member,protected-access + subtypes = type_.__args__ + return subtypes + return None + def _dispatcher(type_: Any) -> Tuple[str, List[type]]: if _is_none_type(type_): @@ -139,12 +157,27 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]: if subtype is not None: return "union", subtype + subtype = _Subtype.callable(type_) + if subtype is not None: + return "callable", subtype + return "atomic", [type_] +def callable_str(subtypes): + if subtypes: + *arg_types, return_type = subtypes + arg_str = ", ".join(_type2str(arg_type) for arg_type in arg_types) + return_type_str = _type2str(return_type) + return f"Callable[[{arg_str}], {return_type_str}]" + else: + return "Callable" + + _TYPE2STR: Dict[Any, Callable] = { "none": lambda: "None", "atomic": lambda t: str(t.__name__), + "callable": callable_str, "list": lambda t: f"List[{_type2str(t)}]", "dict": lambda k, v: f"Dict[{_type2str(k)}, {_type2str(v)}]", "tuple": lambda *t: f"Tuple[{', '.join([_type2str(x) for x in t])}]", @@ -188,6 +221,12 @@ def _type_check_none(v: Any, name: str) -> Optional[str]: def _type_check_atomic(v: Any, name: str, type_: Any) -> Optional[str]: return None if isinstance(v, type_) else _type_check_err(v, name, type_) + def _type_check_callable(v: Any, name: str, *_subtypes: Any) -> Optional[str]: + # Current implementation only validates that the argument is + # callable, and doesn't validate the arguments accepted by the + # callable, if any. 
+ return None if callable(v) else _type_check_err(v, name, Callable) + def _type_check_list(v: List[Any], name: str, type_: Any) -> Optional[str]: if not isinstance(v, (list, tuple)): return _type_check_err(v, name, list) @@ -234,6 +273,7 @@ def _type_check_union(v: Any, name: str, *types: Any) -> Optional[str]: return { "none": _type_check_none, "atomic": _type_check_atomic, + "callable": _type_check_callable, "list": _type_check_list, "dict": _type_check_dict, "tuple": _type_check_tuple, diff --git a/tests/python/unittest/test_type_annotation_checker.py b/tests/python/unittest/test_type_annotation_checker.py index e84ae043d356..204c15331339 100644 --- a/tests/python/unittest/test_type_annotation_checker.py +++ b/tests/python/unittest/test_type_annotation_checker.py @@ -17,13 +17,22 @@ """Test type checker based on python's type annotations""" import sys -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union, Callable import pytest +import _pytest from tvm.tir.schedule._type_checker import type_checked +def int_func(x: int) -> int: + return 2 * x + + +def str_func(x: str) -> str: + return 2 * x + + test_cases = [ { "type_annotation": int, @@ -90,30 +99,71 @@ None, ], }, + { + "type_annotation": Callable, + "positive_cases": [str_func, int_func], + "negative_cases": [ + None, + "x", + 42, + ], + }, + { + "type_annotation": Callable[[int], int], + "positive_cases": [int_func], + "negative_cases": [ + None, + "x", + 42, + pytest.param( + str_func, + marks=pytest.mark.xfail( + reason="Signature of Callable arguments not currently checked" + ), + ), + ], + }, ] -positive_cases = [ - (config["type_annotation"], case) for config in test_cases for case in config["positive_cases"] -] - -negative_cases = [ - (config["type_annotation"], case) for config in test_cases for case in config["negative_cases"] -] +def make_parametrization(type_annotation, case): + if isinstance(case, _pytest.mark.structures.ParameterSet): + marks = case.marks + 
(case,) = case.values + else: + marks = [] -def format_name(type_annotation, case): try: - name = type_annotation.__name__ + annotation_name = type_annotation.__name__ except AttributeError: - name = str(type_annotation).replace("typing.", "") + annotation_name = str(type_annotation).replace("typing.", "") + + if hasattr(case, "__name__"): + case_name = case.__name__ + else: + case_name = str(case) - return f"{name}_{case}" + name = f"{annotation_name}, {case_name}" + + return pytest.param(type_annotation, case, marks=marks, id=name) + + +positive_cases = [ + make_parametrization(config["type_annotation"], case) + for config in test_cases + for case in config["positive_cases"] +] + +negative_cases = [ + make_parametrization(config["type_annotation"], case) + for config in test_cases + for case in config["negative_cases"] +] @pytest.mark.parametrize( ["type_annotation", "case"], positive_cases, - ids=[format_name(t, c) for t, c in positive_cases], ) def test_matches_type(type_annotation, case): @type_checked @@ -126,7 +176,6 @@ def func(_: type_annotation): @pytest.mark.parametrize( ["type_annotation", "case"], negative_cases, - ids=[format_name(t, c) for t, c in negative_cases], ) def test_not_matches(type_annotation, case): @type_checked From 9e88723385f83a2d27a60432cbe50782bed2885f Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 29 Aug 2022 17:27:34 -0700 Subject: [PATCH 069/704] [TIR] Improved error messages for PrimExpr operator overloads (#12638) Previously, type-checks in boolean operators on `PrimExpr` would state that the type is incorrect, but further investigation would be required in order to determine what expression caused the error. After this commit, error messages for these type checks include the expression that was used, and the dtype of that expression. 
--- src/tir/op/op.cc | 58 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index 69d1da5e8c1c..b9e0c3c37068 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -520,10 +520,37 @@ PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) { return tir::NE(a, b, span); } +namespace { +void type_check_boolean_args(const PrimExpr& arg, const char* op) { + ICHECK(arg.dtype().is_bool()) << "Expected boolean argument for " << op << ", but received " + << arg << " of type " << arg.dtype(); +} +void type_check_boolean_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) { + ICHECK(lhs.dtype().is_bool()) << "Expected boolean argument as LHS of " << op << ", but received " + << lhs << " of type " << lhs.dtype(); + ICHECK(rhs.dtype().is_bool()) << "Expected boolean argument as RHS of " << op << ", but received " + << rhs << " of type " << rhs.dtype(); +} + +void type_check_integer_args(const PrimExpr& arg, const char* op) { + ICHECK(arg.dtype().is_int() || arg.dtype().is_uint()) + << "Expected integer argument for " << op << ", but received " << arg << " of type " + << arg.dtype(); +} + +void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const char* op) { + ICHECK(lhs.dtype().is_int() || lhs.dtype().is_uint()) + << "Expected integer argument as LHS of " << op << ", but received " << lhs << " of type " + << lhs.dtype(); + ICHECK(rhs.dtype().is_int() || rhs.dtype().is_uint()) + << "Expected integer argument as RHS of " << op << ", but received " << rhs << " of type " + << rhs.dtype(); +} +} // namespace + PrimExpr operator&&(PrimExpr a, PrimExpr b) { return logical_and(a, b); } PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_bool()); - ICHECK(b.dtype().is_bool()); + type_check_boolean_args(a, b, "&& operator (logical AND)"); PrimExpr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; return tir::And(a, b, 
span); @@ -531,8 +558,7 @@ PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) { PrimExpr operator||(PrimExpr a, PrimExpr b) { return logical_or(a, b); } PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_bool()); - ICHECK(b.dtype().is_bool()); + type_check_boolean_args(a, b, "|| operator (logical OR)"); PrimExpr ret = arith::TryConstFold(a, b); if (ret.defined()) return ret; return tir::Or(a, b, span); @@ -540,7 +566,7 @@ PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) { PrimExpr operator!(PrimExpr a) { return logical_not(a); } PrimExpr logical_not(PrimExpr a, Span span) { - ICHECK(a.dtype().is_bool()); + type_check_boolean_args(a, "! operator (logical NOT)"); PrimExpr ret = arith::TryConstFold(a); if (ret.defined()) return ret; return tir::Not(a, span); @@ -550,8 +576,8 @@ PrimExpr logical_not(PrimExpr a, Span span) { PrimExpr operator>>(PrimExpr a, PrimExpr b) { return right_shift(a, b); } PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_int() || a.dtype().is_uint()); - ICHECK(b.dtype().is_int() || b.dtype().is_uint()); + type_check_integer_args(a, b, ">> operator (right shift)"); + BinaryOpMatchTypes(a, b, span); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -573,8 +599,7 @@ PrimExpr right_shift(PrimExpr a, PrimExpr b, Span span) { // shift left PrimExpr operator<<(PrimExpr a, PrimExpr b) { return left_shift(a, b); } PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_int() || a.dtype().is_uint()); - ICHECK(b.dtype().is_int() || b.dtype().is_uint()); + type_check_integer_args(a, b, "<< operator (left shift)"); BinaryOpMatchTypes(a, b, span); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -593,8 +618,7 @@ PrimExpr left_shift(PrimExpr a, PrimExpr b, Span span) { // bitwise and PrimExpr operator&(PrimExpr a, PrimExpr b) { return bitwise_and(a, b); } PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) { - 
ICHECK(a.dtype().is_int() || a.dtype().is_uint()); - ICHECK(b.dtype().is_int() || b.dtype().is_uint()); + type_check_integer_args(a, b, "& operator (bitwise AND)"); BinaryOpMatchTypes(a, b, span); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -606,8 +630,7 @@ PrimExpr bitwise_and(PrimExpr a, PrimExpr b, Span span) { // bitwise_or PrimExpr operator|(PrimExpr a, PrimExpr b) { return bitwise_or(a, b); } PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_int() || a.dtype().is_uint()); - ICHECK(b.dtype().is_int() || b.dtype().is_uint()); + type_check_integer_args(a, b, "| operator (bitwise OR)"); BinaryOpMatchTypes(a, b, span); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -619,8 +642,7 @@ PrimExpr bitwise_or(PrimExpr a, PrimExpr b, Span span) { // bitwise_xor PrimExpr operator^(PrimExpr a, PrimExpr b) { return bitwise_xor(a, b); } PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) { - ICHECK(a.dtype().is_int() || a.dtype().is_uint()); - ICHECK(b.dtype().is_int() || b.dtype().is_uint()); + type_check_integer_args(a, b, "^ operator (bitwise XOR)"); BinaryOpMatchTypes(a, b, span); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -633,7 +655,7 @@ PrimExpr bitwise_xor(PrimExpr a, PrimExpr b, Span span) { PrimExpr operator~(PrimExpr a) { return bitwise_neg(a); } PrimExpr bitwise_neg(PrimExpr a, Span span) { - ICHECK(a.dtype().is_int() || a.dtype().is_uint()); + type_check_integer_args(a, "~ operator (bitwise NOT)"); return tir::Call(a.dtype(), tir::builtin::bitwise_not(), {a}, span); } @@ -728,7 +750,7 @@ PrimExpr sum(PrimExpr source, Array rdom, Array init, Span sp } PrimExpr all(PrimExpr source, Array rdom, Array init, Span span) { - ICHECK(source.dtype().is_bool()); + type_check_boolean_args(source, "tvm::all"); Var x("x", source.dtype(), span), y("y", source.dtype()); PrimExpr result = tir::And(x, y, span); PrimExpr identity_element = make_const(source.dtype(), true, span); @@ 
-737,7 +759,7 @@ PrimExpr all(PrimExpr source, Array rdom, Array init, Span sp } PrimExpr any(PrimExpr source, Array rdom, Array init, Span span) { - ICHECK(source.dtype().is_bool()); + type_check_boolean_args(source, "tvm::any"); Var x("x", source.dtype(), span), y("y", source.dtype(), span); PrimExpr result = tir::Or(x, y, span); PrimExpr identity_element = make_const(source.dtype(), false, span); From 5287d8f11e28cf4953ca3b5638880397e7ceb48e Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 30 Aug 2022 11:51:12 -0700 Subject: [PATCH 070/704] [ci] Move non-task CI scripts into ci/ folder (#12609) [CI] Update Hexagon image to install boost (#12613) The new image has xgboost installed, which I need for https://github.com/apache/tvm/pull/12587 Validated in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/ci-docker-staging/279/pipeline Co-authored-by: masahi --- .github/ISSUE_TEMPLATE/flaky-test.md | 2 +- .github/workflows/cc_bot.yml | 2 +- .github/workflows/docs_bot.yml | 2 +- .github/workflows/nightly_docker_update.yml | 2 +- .github/workflows/ping_reviewers.yml | 2 +- .github/workflows/tag_teams.yml | 2 +- .github/workflows/tests_bot.yml | 2 +- .github/workflows/tvmbot.yml | 2 +- .../update_last_successful_branch.yml | 2 +- Jenkinsfile | 14 ++++++------ ci/README.md | 1 + ci/jenkins/Prepare.groovy.j2 | 12 +++++----- {tests => ci}/scripts/cmd_utils.py | 0 .../scripts/determine_docker_images.py | 0 {tests => ci}/scripts/git_change_docker.sh | 0 {tests => ci}/scripts/git_change_docs.sh | 0 {tests => ci}/scripts/git_skip_ci.py | 0 {tests => ci}/scripts/git_skip_ci_globs.py | 0 {tests => ci}/scripts/git_utils.py | 0 {tests => ci}/scripts/github_cc_reviewers.py | 0 {tests => ci}/scripts/github_docs_comment.py | 0 .../scripts/github_skipped_tests_comment.py | 0 {tests => ci}/scripts/github_tag_teams.py | 0 {tests => ci}/scripts/github_tvmbot.py | 0 {tests => ci}/scripts/http_utils.py | 0 
.../scripts/open_docker_update_pr.py | 0 {tests => ci}/scripts/ping_reviewers.py | 0 {tests => ci}/scripts/pytest_ids.py | 0 {tests => ci}/scripts/pytest_wrapper.py | 0 .../scripts/should_rebuild_docker.py | 0 .../scripts/should_run_slow_tests.py | 0 {tests => ci}/scripts/update_branch.py | 0 docker/bash.sh | 2 +- tests/python/ci/test_ci.py | 22 +++++++++---------- tests/python/ci/test_tvmbot.py | 2 +- tests/scripts/setup-pytest-env.sh | 2 +- tests/scripts/task_build.py | 5 +++++ tests/scripts/task_python_frontend.sh | 2 +- 38 files changed, 43 insertions(+), 37 deletions(-) rename {tests => ci}/scripts/cmd_utils.py (100%) rename {tests => ci}/scripts/determine_docker_images.py (100%) rename {tests => ci}/scripts/git_change_docker.sh (100%) rename {tests => ci}/scripts/git_change_docs.sh (100%) rename {tests => ci}/scripts/git_skip_ci.py (100%) rename {tests => ci}/scripts/git_skip_ci_globs.py (100%) rename {tests => ci}/scripts/git_utils.py (100%) rename {tests => ci}/scripts/github_cc_reviewers.py (100%) rename {tests => ci}/scripts/github_docs_comment.py (100%) rename {tests => ci}/scripts/github_skipped_tests_comment.py (100%) rename {tests => ci}/scripts/github_tag_teams.py (100%) rename {tests => ci}/scripts/github_tvmbot.py (100%) rename {tests => ci}/scripts/http_utils.py (100%) rename {tests => ci}/scripts/open_docker_update_pr.py (100%) rename {tests => ci}/scripts/ping_reviewers.py (100%) rename {tests => ci}/scripts/pytest_ids.py (100%) rename {tests => ci}/scripts/pytest_wrapper.py (100%) rename {tests => ci}/scripts/should_rebuild_docker.py (100%) rename {tests => ci}/scripts/should_run_slow_tests.py (100%) rename {tests => ci}/scripts/update_branch.py (100%) diff --git a/.github/ISSUE_TEMPLATE/flaky-test.md b/.github/ISSUE_TEMPLATE/flaky-test.md index 1d61bbb632a4..1e8d267f8ec1 100644 --- a/.github/ISSUE_TEMPLATE/flaky-test.md +++ b/.github/ISSUE_TEMPLATE/flaky-test.md @@ -7,7 +7,7 @@ labels: "test: flaky" Thanks for participating in the TVM 
community! We use https://discuss.tvm.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking. You are always welcomed to post on the forum first :smile_cat: -These tests were found to be flaky (intermittently failing on `main` or failed in a PR with unrelated changes). As per [the docs](https://github.com/apache/tvm/blob/main/docs/contribute/ci.rst#handling-flaky-failures, these failures will be disabled in a PR that references this issue until the test owners can fix the source of the flakiness. +These tests were found to be flaky (intermittently failing on `main` or failed in a PR with unrelated changes). As per [the docs](https://github.com/apache/tvm/blob/main/docs/contribute/ci.rst#handling-flaky-failures), these failures will be disabled in a PR that references this issue until the test owners can fix the source of the flakiness. ### Test(s) diff --git a/.github/workflows/cc_bot.yml b/.github/workflows/cc_bot.yml index ac0baa490222..95aa96426229 100644 --- a/.github/workflows/cc_bot.yml +++ b/.github/workflows/cc_bot.yml @@ -44,4 +44,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/github_cc_reviewers.py || echo step failed + python ci/scripts/github_cc_reviewers.py || echo step failed diff --git a/.github/workflows/docs_bot.yml b/.github/workflows/docs_bot.yml index 9480a1176f15..73c12a8d7d05 100644 --- a/.github/workflows/docs_bot.yml +++ b/.github/workflows/docs_bot.yml @@ -15,4 +15,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/github_docs_comment.py \ No newline at end of file + python ci/scripts/github_docs_comment.py \ No newline at end of file diff --git a/.github/workflows/nightly_docker_update.yml b/.github/workflows/nightly_docker_update.yml index 08945555af34..c2441807430f 100644 --- a/.github/workflows/nightly_docker_update.yml +++ 
b/.github/workflows/nightly_docker_update.yml @@ -28,4 +28,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/open_docker_update_pr.py + python ci/scripts/open_docker_update_pr.py diff --git a/.github/workflows/ping_reviewers.yml b/.github/workflows/ping_reviewers.yml index 96c20434d9b5..a2e3e996a033 100644 --- a/.github/workflows/ping_reviewers.yml +++ b/.github/workflows/ping_reviewers.yml @@ -20,4 +20,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/ping_reviewers.py --wait-time-minutes 10080 || echo failed + python ci/scripts/ping_reviewers.py --wait-time-minutes 10080 || echo failed diff --git a/.github/workflows/tag_teams.yml b/.github/workflows/tag_teams.yml index 2518cf87db5b..7c10f9c33d9f 100644 --- a/.github/workflows/tag_teams.yml +++ b/.github/workflows/tag_teams.yml @@ -45,4 +45,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/github_tag_teams.py || echo failed + python ci/scripts/github_tag_teams.py || echo failed diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml index e9d7d81375e4..0ddae2afb771 100644 --- a/.github/workflows/tests_bot.yml +++ b/.github/workflows/tests_bot.yml @@ -18,4 +18,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/github_skipped_tests_comment.py \ No newline at end of file + python ci/scripts/github_skipped_tests_comment.py \ No newline at end of file diff --git a/.github/workflows/tvmbot.yml b/.github/workflows/tvmbot.yml index 87292ec211d1..23e90aed5329 100644 --- a/.github/workflows/tvmbot.yml +++ b/.github/workflows/tvmbot.yml @@ -34,4 +34,4 @@ jobs: RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | set -eux - python tests/scripts/github_tvmbot.py --pr "$PR_NUMBER" --run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT" + python ci/scripts/github_tvmbot.py --pr "$PR_NUMBER" 
--run-url "$RUN_URL" --trigger-comment-json "$ISSUE_COMMENT" diff --git a/.github/workflows/update_last_successful_branch.yml b/.github/workflows/update_last_successful_branch.yml index fc2f2d0d4f2a..6635b9ef4c47 100644 --- a/.github/workflows/update_last_successful_branch.yml +++ b/.github/workflows/update_last_successful_branch.yml @@ -41,4 +41,4 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -eux - python tests/scripts/update_branch.py || echo step failed + python ci/scripts/update_branch.py || echo step failed diff --git a/Jenkinsfile b/Jenkinsfile index 3278e83098b7..1b615e38304c 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-26T15:09:39.104767 +// Generated at 2022-08-26T15:48:19.597592 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -230,7 +230,7 @@ def should_skip_slow_tests(pr_number) { // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests result = sh ( returnStatus: true, - script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'", + script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'", label: 'Check if CI should run slow tests', ) } @@ -255,7 +255,7 @@ def should_skip_ci(pr_number) { } glob_skip_ci_code = sh ( returnStatus: true, - script: "./tests/scripts/git_skip_ci_globs.py", + script: "./ci/scripts/git_skip_ci_globs.py", label: 'Check if CI should be skipped due to changed files', ) if (glob_skip_ci_code == 0) { @@ -269,7 +269,7 @@ def should_skip_ci(pr_number) { // full CI just in case). Exit code of 0 means skip CI. 
git_skip_ci_code = sh ( returnStatus: true, - script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'", + script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'", label: 'Check if CI should be skipped', ) } @@ -284,7 +284,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./tests/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", + script: "./ci/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py @@ -367,14 +367,14 @@ def prepare() { is_docs_only_build = sh ( returnStatus: true, - script: './tests/scripts/git_change_docs.sh', + script: './ci/scripts/git_change_docs.sh', label: 'Check for docs only changes', ) skip_ci = should_skip_ci(env.CHANGE_ID) skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) rebuild_docker_images = sh ( returnStatus: true, - script: './tests/scripts/git_change_docker.sh', + script: './ci/scripts/git_change_docker.sh', label: 'Check for any docker changes', ) diff --git a/ci/README.md b/ci/README.md index 38995549236c..2cb915e70207 100644 --- a/ci/README.md +++ b/ci/README.md @@ -26,6 +26,7 @@ TVM project in a healthy state and preventing breakages. CI in TVM is broken int - The tests themselves, all of which live underneath [`tests`](../tests). - Definitions of test suites, with each suite defined as a separate `task_` script in [`tests/scripts`](../tests/scripts). + - Scripts and automation [`ci/scripts`](../ci/scripts). 
- The linux test sequence (in [`Jenkinsfile`](../Jenkinsfile)), which lints and builds TVM and runs test suites using Docker on Linux. - The Windows and Mac test sequences (in [`.github/actions`](../.github/actions)). diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 index 7115d39ffce3..404d2870c9e2 100644 --- a/ci/jenkins/Prepare.groovy.j2 +++ b/ci/jenkins/Prepare.groovy.j2 @@ -80,7 +80,7 @@ def should_skip_slow_tests(pr_number) { // Exit code of 1 means run slow tests, exit code of 0 means skip slow tests result = sh ( returnStatus: true, - script: "./tests/scripts/should_run_slow_tests.py --pr '${pr_number}'", + script: "./ci/scripts/should_run_slow_tests.py --pr '${pr_number}'", label: 'Check if CI should run slow tests', ) } @@ -105,7 +105,7 @@ def should_skip_ci(pr_number) { } glob_skip_ci_code = sh ( returnStatus: true, - script: "./tests/scripts/git_skip_ci_globs.py", + script: "./ci/scripts/git_skip_ci_globs.py", label: 'Check if CI should be skipped due to changed files', ) if (glob_skip_ci_code == 0) { @@ -119,7 +119,7 @@ def should_skip_ci(pr_number) { // full CI just in case). Exit code of 0 means skip CI. 
git_skip_ci_code = sh ( returnStatus: true, - script: "./tests/scripts/git_skip_ci.py --pr '${pr_number}'", + script: "./ci/scripts/git_skip_ci.py --pr '${pr_number}'", label: 'Check if CI should be skipped', ) } @@ -134,7 +134,7 @@ def prepare() { if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( - script: "./tests/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}", + script: "./ci/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}", label: 'Decide whether to use tlcpack or tlcpackstaging for Docker images', ) // Pull image names from the results of should_rebuild_docker.py @@ -160,14 +160,14 @@ def prepare() { is_docs_only_build = sh ( returnStatus: true, - script: './tests/scripts/git_change_docs.sh', + script: './ci/scripts/git_change_docs.sh', label: 'Check for docs only changes', ) skip_ci = should_skip_ci(env.CHANGE_ID) skip_slow_tests = should_skip_slow_tests(env.CHANGE_ID) rebuild_docker_images = sh ( returnStatus: true, - script: './tests/scripts/git_change_docker.sh', + script: './ci/scripts/git_change_docker.sh', label: 'Check for any docker changes', ) diff --git a/tests/scripts/cmd_utils.py b/ci/scripts/cmd_utils.py similarity index 100% rename from tests/scripts/cmd_utils.py rename to ci/scripts/cmd_utils.py diff --git a/tests/scripts/determine_docker_images.py b/ci/scripts/determine_docker_images.py similarity index 100% rename from tests/scripts/determine_docker_images.py rename to ci/scripts/determine_docker_images.py diff --git a/tests/scripts/git_change_docker.sh b/ci/scripts/git_change_docker.sh similarity index 100% rename from tests/scripts/git_change_docker.sh rename to ci/scripts/git_change_docker.sh diff --git a/tests/scripts/git_change_docs.sh b/ci/scripts/git_change_docs.sh similarity index 100% rename from tests/scripts/git_change_docs.sh 
rename to ci/scripts/git_change_docs.sh diff --git a/tests/scripts/git_skip_ci.py b/ci/scripts/git_skip_ci.py similarity index 100% rename from tests/scripts/git_skip_ci.py rename to ci/scripts/git_skip_ci.py diff --git a/tests/scripts/git_skip_ci_globs.py b/ci/scripts/git_skip_ci_globs.py similarity index 100% rename from tests/scripts/git_skip_ci_globs.py rename to ci/scripts/git_skip_ci_globs.py diff --git a/tests/scripts/git_utils.py b/ci/scripts/git_utils.py similarity index 100% rename from tests/scripts/git_utils.py rename to ci/scripts/git_utils.py diff --git a/tests/scripts/github_cc_reviewers.py b/ci/scripts/github_cc_reviewers.py similarity index 100% rename from tests/scripts/github_cc_reviewers.py rename to ci/scripts/github_cc_reviewers.py diff --git a/tests/scripts/github_docs_comment.py b/ci/scripts/github_docs_comment.py similarity index 100% rename from tests/scripts/github_docs_comment.py rename to ci/scripts/github_docs_comment.py diff --git a/tests/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py similarity index 100% rename from tests/scripts/github_skipped_tests_comment.py rename to ci/scripts/github_skipped_tests_comment.py diff --git a/tests/scripts/github_tag_teams.py b/ci/scripts/github_tag_teams.py similarity index 100% rename from tests/scripts/github_tag_teams.py rename to ci/scripts/github_tag_teams.py diff --git a/tests/scripts/github_tvmbot.py b/ci/scripts/github_tvmbot.py similarity index 100% rename from tests/scripts/github_tvmbot.py rename to ci/scripts/github_tvmbot.py diff --git a/tests/scripts/http_utils.py b/ci/scripts/http_utils.py similarity index 100% rename from tests/scripts/http_utils.py rename to ci/scripts/http_utils.py diff --git a/tests/scripts/open_docker_update_pr.py b/ci/scripts/open_docker_update_pr.py similarity index 100% rename from tests/scripts/open_docker_update_pr.py rename to ci/scripts/open_docker_update_pr.py diff --git a/tests/scripts/ping_reviewers.py 
b/ci/scripts/ping_reviewers.py similarity index 100% rename from tests/scripts/ping_reviewers.py rename to ci/scripts/ping_reviewers.py diff --git a/tests/scripts/pytest_ids.py b/ci/scripts/pytest_ids.py similarity index 100% rename from tests/scripts/pytest_ids.py rename to ci/scripts/pytest_ids.py diff --git a/tests/scripts/pytest_wrapper.py b/ci/scripts/pytest_wrapper.py similarity index 100% rename from tests/scripts/pytest_wrapper.py rename to ci/scripts/pytest_wrapper.py diff --git a/tests/scripts/should_rebuild_docker.py b/ci/scripts/should_rebuild_docker.py similarity index 100% rename from tests/scripts/should_rebuild_docker.py rename to ci/scripts/should_rebuild_docker.py diff --git a/tests/scripts/should_run_slow_tests.py b/ci/scripts/should_run_slow_tests.py similarity index 100% rename from tests/scripts/should_run_slow_tests.py rename to ci/scripts/should_run_slow_tests.py diff --git a/tests/scripts/update_branch.py b/ci/scripts/update_branch.py similarity index 100% rename from tests/scripts/update_branch.py rename to ci/scripts/update_branch.py diff --git a/docker/bash.sh b/docker/bash.sh index 62b71ba3539e..10d80478d3f7 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -295,7 +295,7 @@ if [ -n "${EXPANDED_SHORTCUT}" ]; then if [ "${CI+x}" == "x" ]; then DOCKER_IMAGE_NAME="${EXPANDED_SHORTCUT}" else - python3 tests/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null + python3 ci/scripts/determine_docker_images.py "$DOCKER_IMAGE_NAME=$EXPANDED_SHORTCUT" 2> /dev/null DOCKER_IMAGE_NAME=$(cat ".docker-image-names/$DOCKER_IMAGE_NAME") if [[ "$DOCKER_IMAGE_NAME" == *"tlcpackstaging"* ]]; then echo "WARNING: resolved docker image to fallback tag in tlcpackstaging" >&2 diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index c45a0d8d8ee0..0939aae10ab5 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -172,7 +172,7 @@ def test_skipped_tests_comment( """ Test that a comment with 
a link to the docs is successfully left on PRs """ - skipped_tests_script = REPO_ROOT / "tests" / "scripts" / "github_skipped_tests_comment.py" + skipped_tests_script = REPO_ROOT / "ci" / "scripts" / "github_skipped_tests_comment.py" def write_xml_file(root_dir, xml_file, xml_content): shutil.rmtree(root_dir, ignore_errors=True) @@ -232,7 +232,7 @@ def test_docs_comment( """ Test that a comment with a link to the docs is successfully left on PRs """ - docs_comment_script = REPO_ROOT / "tests" / "scripts" / "github_docs_comment.py" + docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) git.run("init") @@ -258,7 +258,7 @@ def test_cc_reviewers(tmpdir_factory): """ Test that reviewers are added from 'cc @someone' messages in PRs """ - reviewers_script = REPO_ROOT / "tests" / "scripts" / "github_cc_reviewers.py" + reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py" def run(pr_body, requested_reviewers, existing_review_users, expected_reviewers): git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) @@ -335,7 +335,7 @@ def test_update_branch(tmpdir_factory): """ Test that the last-successful branch script updates successfully """ - update_script = REPO_ROOT / "tests" / "scripts" / "update_branch.py" + update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py" def run(statuses, expected_rc, expected_output): git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) @@ -515,7 +515,7 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why): """ Test that CI is skipped when it should be """ - skip_ci_script = REPO_ROOT / "tests" / "scripts" / "git_skip_ci.py" + skip_ci_script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) # Jenkins git is too old and doesn't have 'git init --initial-branch' @@ -548,7 +548,7 @@ def test_skip_globs(tmpdir_factory): """ Test that CI is skipped if only certain files are edited 
""" - script = REPO_ROOT / "tests" / "scripts" / "git_skip_ci_globs.py" + script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci_globs.py" def run(files, should_skip): git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) @@ -587,7 +587,7 @@ def test_ping_reviewers(tmpdir_factory): """ Test that reviewers are messaged after a time period of inactivity """ - reviewers_script = REPO_ROOT / "tests" / "scripts" / "ping_reviewers.py" + reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py" def run(pull_request, check): git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) @@ -744,7 +744,7 @@ def test_github_tag_teams(tmpdir_factory): """ Check that individuals are tagged from team headers """ - tag_script = REPO_ROOT / "tests" / "scripts" / "github_tag_teams.py" + tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py" def run(source_type, data, check): git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) @@ -1081,7 +1081,7 @@ def test_open_docker_update_pr( tmpdir_factory, tlcpackstaging_body, tlcpack_body, expected, expected_images ): """Test workflow to open a PR to update Docker images""" - tag_script = REPO_ROOT / "tests" / "scripts" / "open_docker_update_pr.py" + tag_script = REPO_ROOT / "ci" / "scripts" / "open_docker_update_pr.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) git.run("init") @@ -1152,7 +1152,7 @@ def test_open_docker_update_pr( ) def test_determine_docker_images(tmpdir_factory, images, expected): """Test script to decide whether to use tlcpack or tlcpackstaging for images""" - tag_script = REPO_ROOT / "tests" / "scripts" / "determine_docker_images.py" + tag_script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py" git_dir = tmpdir_factory.mktemp("tmp_git_dir") @@ -1219,7 +1219,7 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec """ Check that the Docker images are built when necessary """ - tag_script = REPO_ROOT / "tests" / "scripts" / "should_rebuild_docker.py" + tag_script = 
REPO_ROOT / "ci" / "scripts" / "should_rebuild_docker.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) git.run("init") diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py index 9568a0469bb0..2c7a0eaec0d4 100644 --- a/tests/python/ci/test_tvmbot.py +++ b/tests/python/ci/test_tvmbot.py @@ -147,7 +147,7 @@ def test_tvmbot(tmpdir_factory, number, filename, expected, comment, user, detai """ Test the mergebot test cases """ - mergebot_script = REPO_ROOT / "tests" / "scripts" / "github_tvmbot.py" + mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py" test_json_dir = Path(__file__).resolve().parent / "sample_prs" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index d6c49a42819a..895979293122 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -39,7 +39,7 @@ function cleanup() { set +x if [ "${#pytest_errors[@]}" -gt 0 ]; then echo "These pytest invocations failed, the results can be found in the Jenkins 'Tests' tab or by scrolling up through the raw logs here." 
- python3 tests/scripts/pytest_wrapper.py "${pytest_errors[@]}" + python3 ci/scripts/pytest_wrapper.py "${pytest_errors[@]}" exit 1 fi set -x diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index e4583fe6af04..1a8a1d112fc0 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -19,9 +19,14 @@ import shutil import os import logging +import sys import multiprocessing from pathlib import Path + +# Hackery to enable importing of utils from ci/scripts +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(REPO_ROOT / "ci" / "scripts")) from cmd_utils import Sh, init_log, REPO_ROOT diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh index 2c7e34fac592..61d7238a594b 100755 --- a/tests/scripts/task_python_frontend.sh +++ b/tests/scripts/task_python_frontend.sh @@ -42,7 +42,7 @@ run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch echo "Running relay Tensorflow frontend test..." 
# Note: Tensorflow tests often have memory issues, so invoke each one separately -TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow) +TENSORFLOW_TESTS=$(./ci/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow) i=0 for node_id in $TENSORFLOW_TESTS; do echo "$node_id" From 58ee935a53893bfd47b9cd7ea4738ecec8d7181e Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Tue, 30 Aug 2022 08:51:53 -1000 Subject: [PATCH 071/704] [TVMScript] support float inf, -inf and nan in TVMScript parser and printer (#12618) * support float inf, -inf and nan in TVMScript parser and printer * address comment and fix lint * use type_extensions.Literal * address comments * fix win build * remove template --- python/tvm/script/tir/__init__.pyi | 18 +++++++-- python/tvm/script/tir/intrin.py | 4 ++ src/printer/tvmscript_printer.cc | 37 +++++++++++++++---- .../unittest/test_tvmscript_roundtrip.py | 22 +++++++++++ 4 files changed, 69 insertions(+), 12 deletions(-) diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi index a62fb102bec5..a64eed055ae8 100644 --- a/python/tvm/script/tir/__init__.pyi +++ b/python/tvm/script/tir/__init__.pyi @@ -464,14 +464,24 @@ class uint32(PrimExpr): class uint64(PrimExpr): def __init__(self: uint64, imm: Union[PrimExpr, int]): ... +# use typing.Literal instead for python 3.8 or higher +import sys + +if sys.version_info >= (3, 8): + from typing import Literal + + SpecialFloatLiteral = Literal["inf", "-inf", "nan"] +else: + SpecialFloatLiteral = str + class float8(PrimExpr): - def __init__(self: float8, imm: Union[PrimExpr, int, float]): ... + def __init__(self: float8, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... class float16(PrimExpr): - def __init__(self: float16, imm: Union[PrimExpr, int, float]): ... + def __init__(self: float16, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... 
class float32(PrimExpr): - def __init__(self: float32, imm: Union[PrimExpr, int, float]): ... + def __init__(self: float32, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... class float64(PrimExpr): - def __init__(self: float64, imm: Union[PrimExpr, int, float]): ... + def __init__(self: float64, imm: Union[PrimExpr, int, float, SpecialFloatLiteral]): ... diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py index 382431c2296a..f3919afe5a24 100644 --- a/python/tvm/script/tir/intrin.py +++ b/python/tvm/script/tir/intrin.py @@ -20,6 +20,7 @@ from typing import List, Any import tvm.tir +from tvm.tir import FloatImm from ..registry import register from ...target import codegen from ..utils import get_param_list, tvm_span_from_synr @@ -51,6 +52,9 @@ def bool(imm, span): # nest closures so we copy the name string def wrap(name): def f(imm, span): + if name.startswith("float"): + if imm in {"inf", "-inf", "nan"}: + return FloatImm(dtype=name, value=float(imm), span=span) return imm.astype(name, span) f.__name__ = name diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 7649b6101919..f5300e1e6985 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -381,18 +381,16 @@ class TVMScriptPrinter : public StmtFunctor, } /*! - * \brief special method to print out const scalar + * \brief special method to print out const int64_t scalar * \param dtype The data type * \param data The pointer to hold the data. */ - template - Doc PrintConstScalar(DataType dtype, const T* data) const { + Doc PrintConstScalar(DataType dtype, const int64_t* data) const { Doc doc; std::ostringstream os; - if (dtype.is_float() || dtype.is_float16() || dtype.is_bfloat16()) { - os.precision(17); - } + os << data[0]; + if (dtype == DataType::Int(32)) { doc << Doc::Text(os.str()); } else if (dtype == DataType::Bool()) { @@ -404,6 +402,29 @@ class TVMScriptPrinter : public StmtFunctor, return doc; } + /*! 
+ * \brief special method to print out const double scalar + * \param dtype The data type + * \param data The pointer to hold the data. + * \note this overloaded function is created as std::isnan of msvc will complain about int64_t + */ + Doc PrintConstScalar(DataType dtype, const double* data) const { + Doc doc; + std::ostringstream os; + + os.precision(17); + if (std::isinf(data[0]) || std::isnan(data[0])) { + os << "\"" << data[0] << "\""; + } else { + os << data[0]; + } + + doc << tir_prefix_ << "." << runtime::DLDataType2String(dtype) << "(" << Doc::Text(os.str()) + << ")"; + + return doc; + } + public: static Doc PrintHeader(const std::string& tir_prefix) { Doc header; @@ -731,12 +752,12 @@ Doc TVMScriptPrinter::VisitStmtDefault_(const Object* op) { Doc TVMScriptPrinter::VisitExpr_(const IntImmNode* op, ExprPrecedence* out_precedence) { *out_precedence = ExprPrecedence::kIdentity; - return PrintConstScalar(op->dtype, &(op->value)); + return PrintConstScalar(op->dtype, &(op->value)); } Doc TVMScriptPrinter::VisitExpr_(const FloatImmNode* op, ExprPrecedence* out_precedence) { *out_precedence = ExprPrecedence::kIdentity; - return PrintConstScalar(op->dtype, &(op->value)); + return PrintConstScalar(op->dtype, &(op->value)); } Doc TVMScriptPrinter::VisitExpr_(const StringImmNode* op, ExprPrecedence* out_precedence) { diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index e98f5057d8c4..45ea88f829ec 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3313,6 +3313,27 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) -> return func +def float_infinity(): + @T.prim_func + def func( + placeholder: T.Buffer[(1, 512, 768), "float32"], T_isinf: T.Buffer[(1, 512, 768), "bool"] + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + 
for i0, i1, i2 in T.grid(1, 512, 768): + with T.block("T_isinf"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads(placeholder[ax0, ax1, ax2]) + T.writes(T_isinf[ax0, ax1, ax2]) + T_isinf[ax0, ax1, ax2] = T.fabs( + placeholder[ax0, ax1, ax2], dtype="float32" + ) == T.float32("inf") and not (T.isnan(placeholder[ax0, ax1, ax2], dtype="bool")) + + return func + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3353,6 +3374,7 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) -> let_expression, void_ptr, decl_buffer, + float_infinity, ) From b44f1343a10ccc908de5e65b864012c72d564a7b Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 30 Aug 2022 12:48:43 -0700 Subject: [PATCH 072/704] [microTVM][ARM-DSP] Fix pool schedule (#12653) When I built keyword spotting ONNX model, there was an issue with the pool schedule because certain schedules like broadcast and elemwise do not have input tensors. --- python/tvm/topi/arm_cpu/mprofile/dsp/pool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py index 99470a28530a..441683112447 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py @@ -105,8 +105,8 @@ def pool_dsp_schedule(outs, layout): s = te.create_schedule([x.op for x in outs]) def _callback(op): - in_dtype = op.input_tensors[0].dtype if "pool_max" in op.tag: + in_dtype = op.input_tensors[0].dtype if in_dtype != "int8": logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype) elif layout == "NWC": @@ -114,6 +114,7 @@ def _callback(op): elif layout == "NHWC": schedule_maxpool_2d_nhwc(s, op) elif "pool_sum" in op.tag: + in_dtype = op.input_tensors[0].dtype if in_dtype != "int16": logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype) elif layout == "NCW": From d421e32f1a3be11a908f897118deee018e309d97 Mon Sep 
17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 30 Aug 2022 13:40:18 -0700 Subject: [PATCH 073/704] [microTVM]Fix test util functions (#12641) * Fix test utils * Update python/tvm/micro/testing/utils.py Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> --- python/tvm/micro/testing/evaluation.py | 4 +++- python/tvm/micro/testing/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py index 32de1d2a370d..c8a90ff5b40f 100644 --- a/python/tvm/micro/testing/evaluation.py +++ b/python/tvm/micro/testing/evaluation.py @@ -22,6 +22,7 @@ """ +import logging from io import StringIO from pathlib import Path from contextlib import ExitStack @@ -151,7 +152,8 @@ def predict_labels_aot(session, aot_executor, input_data, runs_per_sample=1): assert aot_executor.get_num_outputs() == 1 assert runs_per_sample > 0 - for sample in input_data: + for counter, sample in enumerate(input_data): + logging.info("Evaluating sample %d", counter) aot_executor.get_input(0).copyfrom(sample) result = aot_executor.module.time_evaluator("run", session.device, number=runs_per_sample)() predicted_label = aot_executor.get_output(0).numpy().argmax() diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 794f443e47a6..097fbf283a58 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -45,12 +45,12 @@ def get_supported_boards(platform: str): return json.load(f) -def get_target(platform: str, board: str): - """Intentionally simple function for making target strings for microcontrollers. +def get_target(platform: str, board: str) -> tvm.target.Target: + """Intentionally simple function for making Targets for microcontrollers. If you need more complex arguments, one should call target.micro directly. 
Note that almost all, but not all, supported microcontrollers are Arm-based.""" model = get_supported_boards(platform)[board]["model"] - return str(tvm.target.target.micro(model, options=["-device=arm_cpu"])) + return tvm.target.target.micro(model, options=["-device=arm_cpu"]) def check_tune_log(log_path: Union[Path, str]): From 1c32798a2c21fd0db6b0e8c938abee4666163bbd Mon Sep 17 00:00:00 2001 From: Adam Straw Date: Tue, 30 Aug 2022 13:56:04 -0700 Subject: [PATCH 074/704] [Hexagon] Expose gtest output through runtime exception (#12502) Expose Hexagon gtest output in CI by raising it as a runtime exception rather than printing it to stdout. --- tests/python/contrib/test_hexagon/test_run_unit_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_run_unit_tests.py b/tests/python/contrib/test_hexagon/test_run_unit_tests.py index fd75775a0115..24c9f33a8ecb 100644 --- a/tests/python/contrib/test_hexagon/test_run_unit_tests.py +++ b/tests/python/contrib/test_hexagon/test_run_unit_tests.py @@ -17,8 +17,6 @@ """ capture gtest output and return over FFI """ -import numpy as np - import tvm from tvm.contrib.hexagon.session import Session @@ -46,4 +44,7 @@ def test_run_unit_tests(hexagon_session: Session, gtest_args): gtest_error_code = int(gtest_error_code_and_output.splitlines()[0]) gtest_output = gtest_error_code_and_output.split("\n", 1)[-1] print(gtest_output) - np.testing.assert_equal(gtest_error_code, 0) + if gtest_error_code != 0: + raise RuntimeError( + f"Hexagon gtest retruned non-zero error code = {gtest_error_code}:\n{gtest_output}" + ) From 775520c8f3dede1d2b3fb0d34e80ff874b35a99b Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Tue, 30 Aug 2022 15:10:54 -0700 Subject: [PATCH 075/704] [microTVM][Zephyr] Add missing CMSIS-NN source files to cmake file (#12642) This PR adds missing CMSIS-NN source files to Zephyr cmake template file for models like keyword spotting, anomaly detection, VWW and 
image classification. --- .../zephyr/template_project/CMakeLists.txt.template | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template index b5182bf8ac1f..bbd975315e88 100644 --- a/apps/microtvm/zephyr/template_project/CMakeLists.txt.template +++ b/apps/microtvm/zephyr/template_project/CMakeLists.txt.template @@ -32,11 +32,15 @@ if(${ENABLE_CMSIS}) set(CMSIS_PATH ) file(GLOB_RECURSE cmsis_lib_srcs - ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c + ${CMSIS_PATH}/CMSIS/NN/Source/ActivationFunctions/*.c + ${CMSIS_PATH}/CMSIS/NN/Source/BasicMathFunctions/*.c + ${CMSIS_PATH}/CMSIS/NN/Source/ConcatenationFunctions/*.c ${CMSIS_PATH}/CMSIS/NN/Source/ConvolutionFunctions/*.c ${CMSIS_PATH}/CMSIS/NN/Source/FullyConnectedFunctions/*.c ${CMSIS_PATH}/CMSIS/NN/Source/NNSupportFunctions/*.c ${CMSIS_PATH}/CMSIS/NN/Source/PoolingFunctions/*.c + ${CMSIS_PATH}/CMSIS/NN/Source/ReshapeFunctions/*.c + ${CMSIS_PATH}/CMSIS/NN/Source/SoftmaxFunctions/*.c ) set(cmsis_includes From caf326fab2963dac8fe03d266ea33d323f4b4470 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 30 Aug 2022 15:19:27 -0700 Subject: [PATCH 076/704] [ci] Add mechanism for trust on certain CI scripts (#12604) This makes it so changes to certain files from users not listed in `CONTRIBUTING.md` are not tested in CI. This is necessary since these scripts run on the baremetal EC2 instances and not inside Docker containers, so they can affect other builds and potentially grab Jenkins secrets. This checks out the version from the upstream for the listed files after running `git checkout`. 
Tested in CI: [positive](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12604/6/pipeline/) and [negative](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12604/9/pipeline/) --- Jenkinsfile | 27 ++++++++++++++++++++++++++- ci/jenkins/Prepare.groovy.j2 | 25 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1b615e38304c..50eee01fa974 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-26T15:48:19.597592 +// Generated at 2022-08-30T11:58:06.036509 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -169,6 +169,7 @@ def init_git() { """, label: 'Update git submodules', ) + checkout_trusted_files() } def docker_init(image) { @@ -248,6 +249,30 @@ def cancel_previous_build() { } } +def checkout_trusted_files() { + // trust everything from branch builds + if (!env.BRANCH_NAME.startsWith('PR-')) { + return; + } + + // trust people listed in CONTRIBUTORS.md + grep_code = sh( + returnStatus: true, + script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'", + label: 'Check if change is from a contributor', + ) + + if (grep_code == 1) { + // Any scripts that run on the bare host and not inside a Docker container + // (especially those that access secrets) should be checked out here so + // only trusted versions are used in CI + sh( + script: "git checkout ${upstream_revision} ci/scripts/.", + label: 'Check out trusted files', + ) + } +} + def should_skip_ci(pr_number) { if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { // never skip CI on build sourced from a branch diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 
index 404d2870c9e2..94575a7b4b64 100644 --- a/ci/jenkins/Prepare.groovy.j2 +++ b/ci/jenkins/Prepare.groovy.j2 @@ -38,6 +38,7 @@ def init_git() { """, label: 'Update git submodules', ) + checkout_trusted_files() } def docker_init(image) { @@ -98,6 +99,30 @@ def cancel_previous_build() { } } +def checkout_trusted_files() { + // trust everything from branch builds + if (!env.BRANCH_NAME.startsWith('PR-')) { + return; + } + + // trust people listed in CONTRIBUTORS.md + grep_code = sh( + returnStatus: true, + script: "git show '${upstream_revision}:CONTRIBUTORS.md' | grep '@${env.CHANGE_AUTHOR}'", + label: 'Check if change is from a contributor', + ) + + if (grep_code == 1) { + // Any scripts that run on the bare host and not inside a Docker container + // (especially those that access secrets) should be checked out here so + // only trusted versions are used in CI + sh( + script: "git checkout ${upstream_revision} ci/scripts/.", + label: 'Check out trusted files', + ) + } +} + def should_skip_ci(pr_number) { if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { // never skip CI on build sourced from a branch From f7cc992a9812872396bf5d42cc70461c3bd7e81f Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Tue, 30 Aug 2022 20:09:15 -0700 Subject: [PATCH 077/704] [MetaSchedule] Complete NCHW Conv2D Winograd Kernel Scheduling (#12648) * Complete winograd scheduling. * Fix test. 
--- python/tvm/topi/cuda/conv2d_winograd.py | 1 + src/meta_schedule/schedule_rule/winograd.cc | 29 ++++++++++++++++++- .../unittest/test_meta_schedule_space_cuda.py | 13 +++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/python/tvm/topi/cuda/conv2d_winograd.py b/python/tvm/topi/cuda/conv2d_winograd.py index f5e6cd88a5e3..239d05844b40 100644 --- a/python/tvm/topi/cuda/conv2d_winograd.py +++ b/python/tvm/topi/cuda/conv2d_winograd.py @@ -104,6 +104,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, pre_ kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw] ), name="kernel_pack", + attrs={"schedule_rule": "meta_schedule.winograd_kernel_pack.nchw.cuda"}, ) else: kernel_pack = kernel diff --git a/src/meta_schedule/schedule_rule/winograd.cc b/src/meta_schedule/schedule_rule/winograd.cc index 8ae8118731dd..22e2300d63b6 100644 --- a/src/meta_schedule/schedule_rule/winograd.cc +++ b/src/meta_schedule/schedule_rule/winograd.cc @@ -185,6 +185,32 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_inverse.nchw.cuda") return {sch}; }); +TVM_REGISTER_GLOBAL("meta_schedule.winograd_kernel_pack.nchw.cuda") + .set_body_typed([](Schedule sch, BlockRV kernel_pack) -> Array { + Array loops = sch->GetLoops(kernel_pack); + ICHECK_EQ(loops.size(), 6); + if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[0]))) { + if (*i <= 16) { + sch->Unroll(loops[0]); + } + } + if (const int64_t* i = tir::GetLoopIntExtent(sch->GetSRef(loops[1]))) { + if (*i <= 16) { + sch->Unroll(loops[1]); + } + } + sch->Unroll(loops[4]); + sch->Unroll(loops[5]); + + LoopRV fused = sch->Fuse({loops[2], loops[3]}); + + int64_t max_threadblocks = 256; + int64_t max_threads_per_block = 1024; + auto get_factor = MakeFactorSampler(sch, {32, 64, 128, 256, 512, 1024}); + BindBlockThreadIdx(sch, kernel_pack, max_threadblocks, max_threads_per_block, get_factor); + return {sch}; + }); + TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.cuda") 
.set_body_typed([](Schedule sch, BlockRV data_pack) -> Array { BlockRV input_tile = GetOnlyProducer(sch, data_pack); @@ -206,9 +232,10 @@ TVM_REGISTER_GLOBAL("meta_schedule.winograd_data_pack.nchw.cuda") BlockRV data_pad = GetOnlyProducer(sch, input_tile); BlockRV data_l = sch->CacheWrite(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local"); + BlockRV d = sch->CacheRead(data_pack, /*buffer_index=*/0, /*storage_scope=*/"local"); LoopRV loop = ScheduleDataPackNCHW(sch, data_pack); sch->ReverseComputeAt(data_l, loop, /*preserve_unit_loops=*/true); - sch->ComputeAt(input_tile, /*loop_rv=*/loop, /*preserve_unit_loops=*/true); + sch->ComputeAt(d, /*loop_rv=*/loop, /*preserve_unit_loops=*/true); sch->ComputeInline(data_pad); int64_t max_threadblocks = 256; diff --git a/tests/python/unittest/test_meta_schedule_space_cuda.py b/tests/python/unittest/test_meta_schedule_space_cuda.py index ce333887ec83..ffa2b57ba8ec 100644 --- a/tests/python/unittest/test_meta_schedule_space_cuda.py +++ b/tests/python/unittest/test_meta_schedule_space_cuda.py @@ -1338,11 +1338,20 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T bgemm = T.alloc_buffer([6, 6, 64, 3136], dtype="float32") inverse_local = T.alloc_buffer([64, 3136, 4, 4], dtype="float32", scope="local") data_pack_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local") + d_local = T.alloc_buffer([64, 3136, 6, 6], dtype="float32", scope="local") bgemm_local = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="local") kernel_shared = T.alloc_buffer([6, 6, 64, 64], dtype="float32", scope="shared") data_pack_shared = T.alloc_buffer([6, 6, 64, 3136], dtype="float32", scope="shared") for i2_i3_0_fused_i3_1_fused_0 in T.thread_binding(3136, thread="blockIdx.x"): for i2_i3_0_fused_i3_1_fused_1 in T.thread_binding(64, thread="threadIdx.x"): + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 6, 6): + with T.block("d_local"): + v0 = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + 
i2_i3_0_fused_i3_1_fused_1) // 3136 + ax0) + v1 = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7 + ax1) + v2, v3 = T.axis.remap("SS", [ax2, ax3]) + T.reads(data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1]) + T.writes(d_local[v0, v1, v2, v3]) + d_local[v0, v1, v2, v3] = T.if_then_else(1 <= v1 % 3136 // 56 * 4 + v2 and v1 % 3136 // 56 * 4 + v2 < 225 and 1 <= v1 % 56 * 4 + v3 and v1 % 56 * 4 + v3 < 225, data[v1 // 3136, v0, v1 % 3136 // 56 * 4 + v2 - 1, v1 % 56 * 4 + v3 - 1], T.float32(0), dtype="float32") for i0 in T.unroll(6): for i1 in T.unroll(6): for i4 in T.unroll(6): @@ -1352,12 +1361,12 @@ def winograd_nchw_conv2d(data: T.Buffer[(1, 64, 224, 224), "float32"], kernel: T ci = T.axis.spatial(64, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) // 3136) p = T.axis.spatial(3136, (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 3136 // 7 * 7 + (i2_i3_0_fused_i3_1_fused_0 * 64 + i2_i3_0_fused_i3_1_fused_1) % 7) r_a, r_a_1 = T.axis.remap("RR", [i4, i5]) - T.reads(data[p // 3136, ci, p % 3136 // 56 * 4 + r_a - 1, p % 56 * 4 + r_a_1 - 1]) + T.reads(d_local[ci, p, r_a, r_a_1]) T.writes(data_pack_local[eps, nu, ci, p]) T.block_attr({"schedule_rule":"meta_schedule.winograd_data_pack.nchw.cuda"}) with T.init(): data_pack_local[eps, nu, ci, p] = T.float32(0) - data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + T.if_then_else(1 <= p % 3136 // 56 * 4 + r_a and p % 3136 // 56 * 4 + r_a < 225 and 1 <= p % 56 * 4 + r_a_1 and p % 56 * 4 + r_a_1 < 225, data[p // 3136, ci, p % 3136 // 56 * 4 + r_a - 1, p % 56 * 4 + r_a_1 - 1], T.float32(0), dtype="float32") * T.Select(r_a % 6 == 5 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 
and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_a_1 % 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 2, 
T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_a_1 % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_a_1 % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_a_1 % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_a_1 % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_a_1 % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) + data_pack_local[eps, nu, ci, p] = data_pack_local[eps, nu, ci, p] + d_local[ci, p, r_a, r_a_1] * T.Select(r_a % 6 == 5 and eps % 6 == 5, 
T.float32(1), T.Select(r_a % 6 == 5 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 5 and eps % 6 == 0, T.float32(0), T.Select(r_a % 6 == 4 and eps % 6 == 5, T.float32(1.5), T.Select(r_a % 6 == 4 and eps % 6 == 4, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 3, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 2, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 4 and eps % 6 == 0, T.float32(1), T.Select(r_a % 6 == 3 and eps % 6 == 5, T.float32(-2), T.Select(r_a % 6 == 3 and eps % 6 == 4, T.float32(-0.5), T.Select(r_a % 6 == 3 and eps % 6 == 3, T.float32(2), T.Select(r_a % 6 == 3 and eps % 6 == 2, T.float32(2.5), T.Select(r_a % 6 == 3 and eps % 6 == 1, T.float32(0.5), T.Select(r_a % 6 == 3 and eps % 6 == 0, T.float32(1.5), T.Select(r_a % 6 == 2 and eps % 6 == 5, T.float32(-1.5), T.Select(r_a % 6 == 2 and eps % 6 == 4, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 3, T.float32(-1), T.Select(r_a % 6 == 2 and eps % 6 == 2, T.float32(0.5), T.Select(r_a % 6 == 2 and eps % 6 == 1, T.float32(-2.5), T.Select(r_a % 6 == 2 and eps % 6 == 0, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 5, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 4, T.float32(0.5), T.Select(r_a % 6 == 1 and eps % 6 == 3, T.float32(-2), T.Select(r_a % 6 == 1 and eps % 6 == 2, T.float32(-1), T.Select(r_a % 6 == 1 and eps % 6 == 1, T.float32(1), T.Select(r_a % 6 == 1 and eps % 6 == 0, T.float32(-1.5), T.Select(r_a % 6 == 0 and eps % 6 == 5, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 4, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 3, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 2, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 1, T.float32(0), T.Select(r_a % 6 == 0 and eps % 6 == 0, T.float32(1), T.float32(0))))))))))))))))))))))))))))))))))))) * T.Select(r_a_1 
% 6 == 5 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 5 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 5 and nu % 6 == 0, T.float32(0), T.Select(r_a_1 % 6 == 4 and nu % 6 == 5, T.float32(1.5), T.Select(r_a_1 % 6 == 4 and nu % 6 == 4, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 3, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 2, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 4 and nu % 6 == 0, T.float32(1), T.Select(r_a_1 % 6 == 3 and nu % 6 == 5, T.float32(-2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 4, T.float32(-0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 3, T.float32(2), T.Select(r_a_1 % 6 == 3 and nu % 6 == 2, T.float32(2.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 1, T.float32(0.5), T.Select(r_a_1 % 6 == 3 and nu % 6 == 0, T.float32(1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 5, T.float32(-1.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 4, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 3, T.float32(-1), T.Select(r_a_1 % 6 == 2 and nu % 6 == 2, T.float32(0.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 1, T.float32(-2.5), T.Select(r_a_1 % 6 == 2 and nu % 6 == 0, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 5, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 4, T.float32(0.5), T.Select(r_a_1 % 6 == 1 and nu % 6 == 3, T.float32(-2), T.Select(r_a_1 % 6 == 1 and nu % 6 == 2, T.float32(-1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 1, T.float32(1), T.Select(r_a_1 % 6 == 1 and nu % 6 == 0, T.float32(-1.5), T.Select(r_a_1 % 6 == 0 and nu % 6 == 5, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 4, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 3, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 2, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 1, T.float32(0), T.Select(r_a_1 % 6 == 0 and nu % 6 == 0, T.float32(1), 
T.float32(0))))))))))))))))))))))))))))))))))))) for ax0, ax1, ax2, ax3 in T.grid(6, 6, 1, 1): with T.block("data_pack_local"): v0, v1 = T.axis.remap("SS", [ax0, ax1]) From f114d55bee538b5420c9c993aa789abff245d897 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Wed, 31 Aug 2022 11:12:05 +0800 Subject: [PATCH 078/704] [TIR] Preserve annotations after lower opaque block (#12572) --- src/tir/transforms/lower_opaque_block.cc | 60 ++++++++++++++----- .../test_tir_transform_lower_opaque_block.py | 37 ++++++++++++ 2 files changed, 83 insertions(+), 14 deletions(-) diff --git a/src/tir/transforms/lower_opaque_block.cc b/src/tir/transforms/lower_opaque_block.cc index 69d8787aa1a1..a4655ebbaed5 100644 --- a/src/tir/transforms/lower_opaque_block.cc +++ b/src/tir/transforms/lower_opaque_block.cc @@ -59,6 +59,12 @@ class OpaqueBlockLower : public StmtExprMutator { } body = Allocate(buffer->data, buffer->dtype, new_shape, const_true(), std::move(body)); } + // Step 4. Handle annotations, block annotations are not preserved by default. + std::vector> pragma_attrs; + HandleAnnotations(new_block->annotations, &pragma_attrs, /*is_block=*/true); + for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) { + body = AttrStmt(Integer(0), it->first, it->second, std::move(body)); + } return body; } @@ -72,7 +78,11 @@ class OpaqueBlockLower : public StmtExprMutator { } // Step 2. Visit recursively Stmt body = this->VisitStmt(op->body); - // Step 3. Create new For loop accordingly + // Step 3. Handle annotations + std::vector> pragma_attrs; + Map new_annotations = + HandleAnnotations(op->annotations, &pragma_attrs, /*is_block=*/false); + // Step 4. Create new For loop accordingly if (op->kind == ForKind::kThreadBinding) { // Case 1. Thread binding ICHECK(op->thread_binding.defined()); @@ -83,20 +93,12 @@ class OpaqueBlockLower : public StmtExprMutator { return body; } else { // Case 3. 
An ordinary loop - body = For(op->loop_var, std::move(min), std::move(extent), op->kind, std::move(body)); - } - // Step 4. Handle annotations - std::set ordered_ann_keys; - for (const auto& annotation : op->annotations) { - ordered_ann_keys.insert(annotation.first); + body = For(op->loop_var, std::move(min), std::move(extent), op->kind, std::move(body), + NullOpt, new_annotations); } - for (auto it = ordered_ann_keys.rbegin(); it != ordered_ann_keys.rend(); ++it) { - const std::string& ann_key = *it; - const ObjectRef& ann_value = op->annotations.at(ann_key); - if (attr::IsPragmaKey(ann_key)) { - body = - AttrStmt(op->loop_var, ann_key, ConvertAttrValue(ann_key, ann_value), std::move(body)); - } + // Step 5. Insert nested attrs + for (auto it = pragma_attrs.rbegin(); it != pragma_attrs.rend(); ++it) { + body = AttrStmt(op->loop_var, it->first, it->second, std::move(body)); } return body; } @@ -146,8 +148,38 @@ class OpaqueBlockLower : public StmtExprMutator { } } + /*! + * \brief Helper to handle annotation dict. + * (1) if the attr key is prefixed by `pragma_`, move to ordered kv list. They + * are lowered to `AttrStmt` by legacy TE schedule convention. + * (2) the non-pragma loop annotations are preserved + * (3) the non-pragma block annotations are dropped + * \return New annotation dict with preserved keys. Also update pragma attr pairs ordered by key. + */ + Map HandleAnnotations( + const Map& annotations, + std::vector>* pragma_attrs, bool is_block) { + Map preserved_annotations; + pragma_attrs->clear(); + for (const auto& kv : annotations) { + const String& key = kv.first; + if (attr::IsPragmaKey(key)) { + pragma_attrs->emplace_back(key, ConvertAttrValue(key, kv.second)); + } else if (!is_block) { + // the loop annotation is preserved + preserved_annotations.Set(key, kv.second); + } + } + std::sort(pragma_attrs->begin(), pragma_attrs->end(), + [](const auto& p1, const auto& p2) { return p1.first < p2.first; }); + return preserved_annotations; + } + /*! 
\brief Record the loop_var and loop start value of unit loops, whose extent is one. */ std::unordered_map unit_loop_vars_; + + /*! \brief Attr keys to preserve into loop annotations. */ + std::unordered_set preserved_annotations_; }; PrimFunc LowerOpaqueBlock(PrimFunc f) { diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py index 9b18c407c40c..6f557ba09d43 100644 --- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py +++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py @@ -321,6 +321,43 @@ def test_annotated_loops(): tvm.ir.assert_structural_equal(attr3.value, tvm.tir.FloatImm("float32", 0.0)) +def test_annotated_block(): + @T.prim_func + def annotated_block() -> None: + with T.block(): + T.block_attr({"pragma_1": "str_value", "pragma_2": 1, "pragma_3": 0.0}) + T.evaluate(0) + + mod = tvm.IRModule.from_expr(annotated_block) + mod = tvm.tir.transform.LowerOpaqueBlock()(mod) + attr1 = mod["main"].body + attr2 = attr1.body + attr3 = attr2.body + assert attr1.attr_key == "pragma_1" and attr1.value == "str_value" + assert attr2.attr_key == "pragma_2" + tvm.ir.assert_structural_equal(attr2.value, tvm.tir.IntImm("int32", 1)) + assert attr3.attr_key == "pragma_3" + tvm.ir.assert_structural_equal(attr3.value, tvm.tir.FloatImm("float32", 0.0)) + + +def test_preserved_annotations(): + @T.prim_func + def before(A: T.Buffer[8, "float32"], B: T.Buffer[8, "float32"]): + for i in T.serial(8, annotations={"k_0": 1, "k_1": [2, 3], "k_2": 3.14}): + with T.block("block"): + T.block_attr({"k_3": "oops"}) + B[i] = A[i] + 1.0 + + @T.prim_func + def after(A: T.Buffer[8, "float32"], B: T.Buffer[8, "float32"]): + for i in T.serial(8, annotations={"k_0": 1, "k_1": [2, 3], "k_2": 3.14}): + B[i] = A[i] + 1.0 + + mod = tvm.IRModule.from_expr(before) + mod = tvm.tir.transform.LowerOpaqueBlock()(mod) + tvm.ir.assert_structural_equal(mod["main"], after) + + def 
test_boolean_handling(): _check(boolean_handling_before, boolean_handling_after) From c2824a84d51ad3d64be2e72680b33c378f033f99 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Wed, 31 Aug 2022 01:21:36 -0700 Subject: [PATCH 079/704] [Testing] Allow NCHW layout in `relay_workload` (#12656) --- .../tvm/auto_scheduler/testing/tune_relay.py | 6 +++ .../meta_schedule/testing/relay_workload.py | 43 ++++++++++++------- .../tvm/meta_schedule/testing/tune_relay.py | 6 +++ .../unittest/test_meta_schedule_tune_relay.py | 36 +++++++++------- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py index 2d84389f9de1..9773fbbc65ad 100644 --- a/python/tvm/auto_scheduler/testing/tune_relay.py +++ b/python/tvm/auto_scheduler/testing/tune_relay.py @@ -73,6 +73,11 @@ def _parse_args(): type=str, required=True, ) + args.add_argument( + "--layout", + type=str, + default=None, + ) args.add_argument( "--cache-dir", type=str, @@ -168,6 +173,7 @@ def main(): mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, + layout=ARGS.layout, cache_dir=ARGS.cache_dir, ) input_info = [ diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py index 3cdf251fe4b6..016263489527 100644 --- a/python/tvm/meta_schedule/testing/relay_workload.py +++ b/python/tvm/meta_schedule/testing/relay_workload.py @@ -34,11 +34,12 @@ def _get_network( - args: Tuple[str, List[int]] + args: Tuple[str, List[int], str] ) -> Tuple[IRModule, bytearray, Tuple[str, List[int], str]]: name: str input_shape: List[int] - name, input_shape = args + layout: str + name, input_shape, layout = args mod: IRModule @@ -57,6 +58,8 @@ def _get_network( import torch # type: ignore from torchvision import models # type: ignore + assert layout is None or layout in ["NCHW", "NHWC"] + if name in ["resnet_18", "resnet_50"]: model = 
getattr(models, name.replace("_", ""))(pretrained=False) elif name == "wide_resnet_50": @@ -86,20 +89,21 @@ def _get_network( input_name = "input0" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) + passes = [relay.transform.RemoveUnusedFunctions()] + if layout == "NHWC": + # PyTorch is imported as NCHW by default + passes.append( + relay.transform.ConvertLayout( + { + "nn.conv2d": ["NHWC", "default"], + "nn.conv3d": ["NDHWC", "default"], + "nn.max_pool2d": ["NHWC", "default"], + "nn.avg_pool2d": ["NHWC", "default"], + } + ) + ) with tvm.transform.PassContext(opt_level=3): - mod = tvm.transform.Sequential( - [ - relay.transform.RemoveUnusedFunctions(), - relay.transform.ConvertLayout( - { - "nn.conv2d": ["NHWC", "default"], - "nn.conv3d": ["NDHWC", "default"], - "nn.max_pool2d": ["NHWC", "default"], - "nn.avg_pool2d": ["NHWC", "default"], - } - ), - ] - )(mod) + mod = tvm.transform.Sequential(passes)(mod) inputs = (input_name, input_shape, dtype) elif name in ["bert_tiny", "bert_base", "bert_medium", "bert_large"]: os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -107,6 +111,8 @@ def _get_network( import torch # type: ignore import transformers # type: ignore + assert layout is None + config_dict = { "bert_tiny": transformers.BertConfig( num_hidden_layers=6, @@ -151,6 +157,8 @@ def _get_network( mod = relay.transform.CombineParallelBatchMatmul()(mod) inputs = (input_name, input_shape, input_dtype) elif name == "dcgan": + assert layout is None + output_shape = input_shape batch_size = output_shape[0] oshape = output_shape[1:] @@ -190,6 +198,7 @@ def get_network( name: str, input_shape: List[int], *, + layout: Optional[str] = None, cache_dir: Optional[str] = None, ) -> Tuple[IRModule, Dict[str, NDArray], Tuple[str, List[int], str]]: """Get the symbol definition and random weight of a network @@ -200,6 +209,8 @@ def get_network( The name of the network. 
input_shape : List[int] The shape of the input tensor. + layout : Optional[str] + The layout of the input tensor. For vision models, the layout is by default NHWC. cache_dir : Optional[str], optional The directory to cache the generated network. If not specified, the cache will be disabled. @@ -223,7 +234,7 @@ def get_network( cached = _load_cache(cache_dir, filename) if cached is None: with multiprocessing.Pool(processes=1) as pool: - result = pool.map(_get_network, [(name, input_shape)]) + result = pool.map(_get_network, [(name, input_shape, layout)]) ((mod, params_bytearray, inputs),) = result cached = [mod, params_bytearray, inputs] _save_cache(cache_dir, filename, cached) diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py index 596a5a736333..7c5977495db5 100644 --- a/python/tvm/meta_schedule/testing/tune_relay.py +++ b/python/tvm/meta_schedule/testing/tune_relay.py @@ -72,6 +72,11 @@ def _parse_args(): type=str, required=True, ) + args.add_argument( + "--layout", + type=str, + default=None, + ) args.add_argument( "--cache-dir", type=str, @@ -137,6 +142,7 @@ def main(): mod, params, (input_name, input_shape, input_dtype) = get_network( ARGS.workload, ARGS.input_shape, + layout=ARGS.layout, cache_dir=ARGS.cache_dir, ) input_info = [ diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index b05b57feaf4c..0267352fd697 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -18,7 +18,7 @@ import logging import tempfile from os import path as osp -from typing import List +from typing import List, Optional import numpy as np # type: ignore import pytest @@ -113,20 +113,21 @@ def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T. 
@pytest.mark.skip("Integration test") @pytest.mark.parametrize( - "model_name, input_shape, target", + "model_name, input_shape, target, layout", [ - ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16"), - ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070"), - ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16"), - ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070"), - ("bert_base", [1, 64], "llvm --num-cores=16"), - ("bert_base", [1, 64], "nvidia/geforce-rtx-3070"), + ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"), + ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"), + ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"), + ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"), + ("bert_base", [1, 64], "llvm --num-cores=16", None), + ("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None), ], ) def test_meta_schedule_tune_relay( model_name: str, input_shape: List[int], target: str, + layout: Optional[str], ): dev = tvm.cpu() if str(target).startswith("llvm") else tvm.cuda() if model_name.startswith("bert"): @@ -134,7 +135,12 @@ def test_meta_schedule_tune_relay( else: data = tvm.nd.array(np.random.randn(*input_shape).astype("float32"), dev) - mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape) + mod, params, (input_name, _, _) = get_network( + name=model_name, + input_shape=input_shape, + layout=layout, + ) + target = Target(target) with tempfile.TemporaryDirectory() as work_dir: with ms.Profiler() as profiler: @@ -536,12 +542,12 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): if __name__ == """__main__""": - test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16") - test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070") - test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16") - test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 
224, 224], "nvidia/geforce-rtx-3070") - test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16") - test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070") + test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NCHW") + test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", None) + test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None) test_meta_schedule_te2primfunc_argument_order() test_meta_schedule_relay_lowering() test_tune_relay_manual_tir_vnni() From acbbd9f15a9ce79ecc88f16f5be9b0c07122cfc4 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Wed, 31 Aug 2022 10:54:54 +0100 Subject: [PATCH 080/704] [ETHOSN] Improve inferring new shape of the Reshape operator (#12594) Fixes the case when reshape is > 4 dims. While this cannot be offloaded to the NPU, the check was previously producing an error preventing further compilation. The correct behavior is to ensure the check returns False and not offload the reshape. 
--- python/tvm/relay/op/contrib/ethosn.py | 2 -- .../backend/contrib/ethosn/ethosn_api.cc | 18 ++++++------- .../contrib/test_ethosn/test_networks.py | 4 +-- .../contrib/test_ethosn/test_reshape.py | 25 ++++++++++++++++--- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 83972bd08b41..a4e9d9647c95 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -360,8 +360,6 @@ def reshape(expr): """Check if a reshape is supported by Ethos-N.""" if not ethosn_available(): return False - if not _is_ethosn_composite(expr.args[0]): - return False return _ethosn.reshape(expr) diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index ccca1779f6d9..55e8901dae08 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -37,7 +37,6 @@ #include #include -#include "../../../op/tensor/transform.h" #include "ethosn_support_library/Support.hpp" #include "ethosn_support_library/SupportQueries.hpp" #include "tvm/relay/qnn/attrs.h" @@ -300,15 +299,16 @@ EthosnError EthosnAPI::Reshape(const Expr& expr, ReshapeParams* params) { sl::DataType input_data_type; EthosnError err = Tvm2Npu(input_dtype->shape, &input_tensor_shape); err += Tvm2Npu(input_dtype->dtype, &input_data_type); - int tensor_size = 1; - for (const auto& dim : input_tensor_shape) { - tensor_size *= dim; - } - Array inferred_shape = {1, 1, 1, 1}; - Array new_shape = InferNewShape(input_dtype->shape, reshape->attrs, false); - for (size_t i = 0; i < new_shape.size(); ++i) { - inferred_shape.Set(i, new_shape[i]); + Array inferred_shape; + Array new_shape = reshape->checked_type().as()->shape; + if (new_shape.size() < 4) { + inferred_shape = {1, 1, 1, 1}; + for (size_t i = 0; i < new_shape.size(); ++i) { + inferred_shape.Set(i, new_shape[i]); + } + } else { + inferred_shape = 
new_shape; } err += Tvm2Npu(inferred_shape, &params->new_shape); diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index d16bf5bf325c..11745409d4ea 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -151,7 +151,7 @@ def test_resnet_50_int8(): input_dict={"input": (1, 224, 224, 3)}, compile_hash=_compile_hash, output_count=1, - host_ops=11, + host_ops=10, npu_partitions=2, ) @@ -211,6 +211,6 @@ def test_ssd_mobilenet_v1(): input_dict={"normalized_input_image_tensor": (1, 300, 300, 3)}, compile_hash=_compile_hash, output_count=4, - host_ops=28, + host_ops=27, npu_partitions=2, ) diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py index cb8a49be2d81..e165cea9c63b 100644 --- a/tests/python/contrib/test_ethosn/test_reshape.py +++ b/tests/python/contrib/test_ethosn/test_reshape.py @@ -28,9 +28,8 @@ def _get_model(input_shape, output_shape, dtype): """Return a model and any parameters it may have""" a = relay.var("a", shape=input_shape, dtype=dtype) - conv, params = tei.get_conv2d(a, input_shape, dtype) - req = relay.reshape(conv, output_shape) - return req, params + req = relay.reshape(a, output_shape) + return req, {} @requires_ethosn @@ -53,6 +52,8 @@ def _get_model(input_shape, output_shape, dtype): ], ) def test_reshape(dtype, input_shape, output_shape): + """Compare Reshape output with TVM.""" + np.random.seed(0) inputs = { "a": tvm.nd.array( @@ -71,3 +72,21 @@ def test_reshape(dtype, input_shape, output_shape): outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) tei.verify(outputs, dtype, 1) + + +@requires_ethosn +@pytest.mark.parametrize( + "input_shape, output_shape", + [ + ( + (1, 13, 13, 255), + (1, 13, 13, 3, 85), + ), + ], +) +def test_reshape_failure(input_shape, output_shape): + """Check Resize is not offloaded.""" + + model, params = 
_get_model(input_shape, output_shape, "int8") + mod = tei.make_module(model, params) + tei.build(mod, params, expected_host_ops=1, npu_partitions=0) From 0c374544a3e1dc358b23e99eaff719631a9984d7 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 31 Aug 2022 09:41:41 -0700 Subject: [PATCH 081/704] [TIR][TVMScript] Update printer / parser to make T.allocate return buffer var (#12412) * Updated TVMScript syntax of `T.allocate` to return buffer var. * Added syntax sugar for `T.decl_buffer`. When `data` field is not specified, `data` will be implicitly created via `Allocate` stmt. * Updated the existing test cases. Most test cases can be updated by changing `T.allocate` to `T.decl_buffer`. `T.allocate` in some tests are updated to `T.allocate` + `T.buffer_decl`, to maintain the legacy behavior of allocation and implicit buffer declaration (will be followed up in future PR to adopt `T.decl_buffer`). --- python/tvm/script/tir/scope_handler.py | 57 ++--- src/printer/tvmscript_printer.cc | 128 +++++----- .../test_copy_compute_reordering.py | 228 +++++++++--------- .../test_ethosu/test_encode_constants.py | 48 ++-- .../test_ethosu/test_hoist_allocates.py | 75 ++++-- .../test_ethosu/test_merge_constants.py | 158 +++++++----- .../test_ethosu/test_remove_concatenates.py | 3 +- .../test_ethosu/test_replace_conv2d.py | 18 +- .../contrib/test_ethosu/test_replace_copy.py | 9 +- .../contrib/test_ethosu/test_scheduler.py | 12 +- .../test_ethosu/test_tir_to_cs_translator.py | 31 ++- ..._meta_schedule_postproc_verify_gpu_code.py | 24 +- .../test_tir_analysis_calculate_workspace.py | 18 +- ...t_tir_analysis_detect_buffer_access_lca.py | 2 +- tests/python/unittest/test_tir_ptx_mma.py | 103 ++++---- tests/python/unittest/test_tir_ptx_mma_sp.py | 32 +-- tests/python/unittest/test_tir_renew_defs.py | 3 +- .../test_tir_structural_equal_hash.py | 2 +- ..._tir_transform_convert_for_loops_serial.py | 4 +- .../test_tir_transform_extract_constants.py | 9 +- 
.../test_tir_transform_flatten_buffer.py | 36 ++- ...est_tir_transform_inject_virtual_thread.py | 12 +- .../test_tir_transform_lower_opaque_block.py | 18 +- ...tir_transform_renormalize_split_pattern.py | 18 +- .../test_tir_transform_storage_flatten.py | 4 +- .../test_tir_transform_storage_rewrite.py | 6 +- .../test_tir_transform_unroll_loop.py | 9 +- tests/python/unittest/test_tir_usmp_algo.py | 38 +-- ...st_tir_usmp_analysis_extract_bufferinfo.py | 146 +++++------ ...orm_convert_pool_allocations_to_offsets.py | 49 ++-- tests/python/unittest/test_tir_usmp_utils.py | 12 +- .../unittest/test_tvmscript_roundtrip.py | 82 +++++-- 32 files changed, 804 insertions(+), 590 deletions(-) diff --git a/python/tvm/script/tir/scope_handler.py b/python/tvm/script/tir/scope_handler.py index 41fa6a5fa2f7..1d2550eecde2 100644 --- a/python/tvm/script/tir/scope_handler.py +++ b/python/tvm/script/tir/scope_handler.py @@ -112,9 +112,9 @@ def allocate(extents, dtype, scope, condition=True, annotations=None, span=None) scope = tvm.runtime.convert(scope) return tvm.tir.Allocate( - self.buffer.data, - self.buffer.dtype, - self.buffer.shape, + self.buffer_var, + dtype, + extents, condition, self.body, annotations=annotations, @@ -122,7 +122,7 @@ def allocate(extents, dtype, scope, condition=True, annotations=None, span=None) ) super().__init__(allocate, concise_scope=True, def_symbol=True) - self.buffer = None + self.buffer_var = None def enter_scope( self, @@ -146,20 +146,15 @@ def enter_scope( else: raise Exception("Internal Bug") - def setup_buffer( + def setup_buffer_var( extents, dtype, scope, condition=True, annotations=None, span: Span = None ): - """Setup buffer object for a given type.""" - self.buffer = tvm.tir.decl_buffer( - shape=extents, - dtype=dtype, - name=name, - scope=scope, - span=span, - ) + """Setup buffer var for a given type.""" + buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype), scope) + self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - 
setup_buffer(*arg_list, span=tvm_span_from_synr(var_span)) - context.update_symbol(name, self.buffer, node) + setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span)) + context.update_symbol(name, self.buffer_var, node) @register @@ -176,7 +171,7 @@ def allocate_const(raw_data, dtype, shape, annotations=None, span=None): list_data.append(i.value) nd_data = tvm.nd.array(np.asarray(list_data, dtype=dtype)) n = tvm.tir.AllocateConst( - self.buffer.data, + self.buffer_var, dtype, shape, nd_data, @@ -187,7 +182,7 @@ def allocate_const(raw_data, dtype, shape, annotations=None, span=None): return n super().__init__(allocate_const, concise_scope=True, def_symbol=True) - self.buffer = None + self.buffer_var = None def enter_scope( self, @@ -211,17 +206,13 @@ def enter_scope( else: raise Exception("Internal Bug") - def setup_buffer(data, dtype, shape, annotations: dict = None, span: Span = None): + def setup_buffer_var(data, dtype, shape, annotations: dict = None, span: Span = None): """Setup buffer var for a given type.""" - self.buffer = tvm.tir.decl_buffer( - shape=shape, - dtype=dtype, - name=name, - span=span, - ) + buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype)) + self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) - setup_buffer(*arg_list, span=tvm_span_from_synr(var_span)) - context.update_symbol(name, self.buffer, node) + setup_buffer_var(*arg_list, span=tvm_span_from_synr(var_span)) + context.update_symbol(name, self.buffer_var, node) @register @@ -248,7 +239,18 @@ def decl_buffer( axis_separators=None, span=None, ): - return tvm.tir.DeclBuffer(self.buffer, self.body, span=span) + decl_buffer = tvm.tir.DeclBuffer(self.buffer, self.body, span=span) + if data is None: + # when data is not specified, the buffer is implicitly allocated + return tvm.tir.Allocate( + self.buffer.data, + dtype, + shape, + tvm.runtime.convert(True), + decl_buffer, + span=span, + ) + return decl_buffer super().__init__(decl_buffer, concise_scope=True, 
def_symbol=True) @@ -298,6 +300,7 @@ def setup_buffer( offset_factor=offset_factor, buffer_type=buffer_type, axis_separators=axis_separators, + name=name, span=span, ) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index f5300e1e6985..5da81de4dc5d 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -100,6 +100,12 @@ class BufferUsageFinder : public StmtExprVisitor { StmtExprVisitor::VisitStmt_(op); } + void VisitStmt_(const DeclBufferNode* op) final { + buffers_declared_.insert(op->buffer.get()); + StmtExprVisitor::VisitStmt_(op); + buffers_declared_.erase(op->buffer.get()); + } + private: explicit BufferUsageFinder(Map> usage) : usage_(usage) {} @@ -107,6 +113,9 @@ class BufferUsageFinder : public StmtExprVisitor { if (buffers_visited_.count(buffer.get())) { return; } + if (buffers_declared_.count(buffer.get())) { + return; + } buffers_visited_.insert(buffer.get()); Array arr = usage_.Get(buffer->data).value_or({}); @@ -119,6 +128,9 @@ class BufferUsageFinder : public StmtExprVisitor { // The buffers that have been visited so far, to avoid duplicate // entries in the search result. std::unordered_set buffers_visited_; + // The buffers declared via `DeclBuffer`. These buffers are excluded from the result because + // T.buffer_decl shouldn't be printed for them. + std::unordered_set buffers_declared_; }; /*! 
@@ -1055,58 +1067,57 @@ Doc TVMScriptPrinter::VisitStmt_(const BufferRealizeNode* op) { } namespace { -struct AllocUsage { - Buffer alloc_buffer; - Array aliasing_buffers; -}; -template -AllocUsage FindAllocateUsage(AllocNode* op, Map>* cache_ptr) { - Map>& cache = *cache_ptr; - if (!cache.count(op->buffer_var)) { - cache = BufferUsageFinder::FindUsage(std::move(cache), op->body); +bool IsAllocateDeclBufferPattern(const AllocateNode* allocate) { + const Var& buffer_var = allocate->buffer_var; + const DeclBufferNode* decl_buffer = allocate->body.as(); + if (!decl_buffer) { + return false; } - Array buffer_usage = cache.Get(op->buffer_var).value_or({}); - - auto is_exact_match = [](Buffer a, Buffer b) { - if (a->dtype != b->dtype) return false; - if (a->shape.size() != b->shape.size()) return false; - - arith::Analyzer analyzer; - for (size_t i = 0; i < a->shape.size(); i++) { - if (!analyzer.CanProveEqual(a->shape[i], b->shape[i])) { - return false; - } - } - return true; - }; - - // If the buffer allocated via T.allocate is an exact match to the - // usage of the buffer later on, then that buffer is the return - // value of T.allocate, and no T.buffer_decl statement is needed. 
- Buffer alloc_buffer(op->buffer_var, op->dtype, op->extents, {}, 0, op->buffer_var->name_hint, 0, - 0, kDefault); - bool found_alloc_buf = false; - Array aliasing_buffers; - for (const auto& buf : buffer_usage) { - if (!found_alloc_buf && is_exact_match(buf, alloc_buffer)) { - alloc_buffer = buf; - found_alloc_buf = true; - } else { - aliasing_buffers.push_back(buf); + const Buffer& buffer = decl_buffer->buffer; + if (!buffer_var.same_as(buffer->data)) { + return false; + } + if (allocate->dtype != buffer->dtype) { + return false; + } + if (!is_one(allocate->condition)) { + return false; + } + if (allocate->annotations.size()) { + return false; + } + if (allocate->extents.size() != buffer->shape.size()) { + return false; + } + tir::ExprDeepEqual expr_equal; + for (size_t i = 0, n = allocate->extents.size(); i < n; ++i) { + if (!expr_equal(allocate->extents[i], buffer->shape[i])) { + return false; } } - - return AllocUsage{alloc_buffer, aliasing_buffers}; + return true; } + } // namespace Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) { - auto usage = FindAllocateUsage(op, &buffer_var_usage_); - Buffer& alloc_buffer = usage.alloc_buffer; - Array& aliasing_buffers = usage.aliasing_buffers; - buf_not_in_headers_.insert(alloc_buffer.get()); - var_not_in_headers_.insert(alloc_buffer->data.get()); + var_not_in_headers_.insert(op->buffer_var.get()); + + if (!buffer_var_usage_.count(op->buffer_var)) { + buffer_var_usage_ = BufferUsageFinder::FindUsage(std::move(buffer_var_usage_), op->body); + } + Array buffer_usage = buffer_var_usage_.Get(op->buffer_var).value_or({}); + + if (buffer_usage.empty()) { + if (IsAllocateDeclBufferPattern(op)) { + // As a syntax sugar, we identify the pattern of Allocate and DeclBuffer and print a single + // DeclBuffer statement. 
It is intentionally to call `Print` instead of `PrintBody` here to + // delegate the printing of the current node to `DeclBufferNode` while maintaining the + // same value of `current_num_` and `num_child_`. + return Print(op->body); + } + } auto storage_scope = GetPtrStorageScope(op->buffer_var); Doc func_call; @@ -1124,12 +1135,12 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateNode* op) { Doc doc; if (current_num_ != num_child_ - 1) { - doc << "with " << func_call << " as " << Print(alloc_buffer) << ":"; - doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(aliasing_buffers) - << PrintBody(op->body)); + doc << "with " << func_call << " as " << Print(op->buffer_var) << ":"; + doc << Doc::Indent( + 4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(op->body)); } else { - doc << Print(alloc_buffer) << " = " << func_call << Doc::NewLine(); - doc << PrintNonHeaderBufferDeclarations(aliasing_buffers) << PrintBody(op->body); + doc << Print(op->buffer_var) << " = " << func_call << Doc::NewLine(); + doc << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(op->body); } TryDeallocVar(op->buffer_var); return doc; @@ -1179,11 +1190,12 @@ Doc TVMScriptPrinter::VisitStmt_(const AllocateConstNode* alloc) { } auto ndarray_str = ss.str(); - auto usage = FindAllocateUsage(alloc, &buffer_var_usage_); - Buffer& alloc_buffer = usage.alloc_buffer; - Array& aliasing_buffers = usage.aliasing_buffers; - buf_not_in_headers_.insert(alloc_buffer.get()); - var_not_in_headers_.insert(alloc_buffer->data.get()); + var_not_in_headers_.insert(alloc->buffer_var.get()); + + if (!buffer_var_usage_.count(alloc->buffer_var)) { + buffer_var_usage_ = BufferUsageFinder::FindUsage(std::move(buffer_var_usage_), alloc->body); + } + Array buffer_usage = buffer_var_usage_.Get(alloc->buffer_var).value_or({}); Doc func_call; func_call << tir_prefix_ << ".allocate_const(" << ndarray_str << ", " << PrintDType(alloc->dtype) @@ -1192,12 +1204,12 @@ Doc 
TVMScriptPrinter::VisitStmt_(const AllocateConstNode* alloc) { Doc doc; var_not_in_headers_.insert(alloc->buffer_var.get()); if (current_num_ != num_child_ - 1) { - doc << "with " << func_call << " as " << Print(alloc_buffer) << ":"; - doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(aliasing_buffers) + doc << "with " << func_call << " as " << Print(alloc->buffer_var) << ":"; + doc << Doc::Indent(4, Doc::NewLine() << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(alloc->body)); } else { - doc << Print(alloc_buffer) << " = " << func_call << Doc::NewLine(); - doc << PrintNonHeaderBufferDeclarations(aliasing_buffers) << PrintBody(alloc->body); + doc << Print(alloc->buffer_var) << " = " << func_call << Doc::NewLine(); + doc << PrintNonHeaderBufferDeclarations(buffer_usage) << PrintBody(alloc->body); } return doc; } diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py index f348fd7f5a77..8c598fe0d794 100644 --- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -40,14 +40,14 @@ def main() -> None: buffer9 = T.buffer_decl([32], "uint8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p2 = T.allocate([112], "uint8", "global") - p3 = T.allocate([112], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") - p5 = T.allocate([32], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") - p7 = T.allocate([112], "uint8", "global") - p8 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([128], "uint8") + p2 = T.decl_buffer([112], "uint8") + p3 = T.decl_buffer([112], "uint8") + p4 = T.decl_buffer([32], "uint8") + p5 = T.decl_buffer([32], "uint8") + p6 = T.decl_buffer([32], "uint8") + p7 = T.decl_buffer([112], "uint8") + p8 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", 
buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -88,14 +88,14 @@ def main() -> None: buffer9 = T.buffer_decl([32], "uint8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p2 = T.allocate([112], "uint8", "global") - p3 = T.allocate([112], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") - p5 = T.allocate([32], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") - p7 = T.allocate([112], "uint8", "global") - p8 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([128], "uint8") + p2 = T.decl_buffer([112], "uint8") + p3 = T.decl_buffer([112], "uint8") + p4 = T.decl_buffer([32], "uint8") + p5 = T.decl_buffer([32], "uint8") + p6 = T.decl_buffer([32], "uint8") + p7 = T.decl_buffer([112], "uint8") + p8 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) @@ -134,14 +134,14 @@ def main() -> None: buffer9 = T.buffer_decl([32], "uint8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p2 = T.allocate([112], "uint8", "global") - p3 = T.allocate([112], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") - p5 = T.allocate([32], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") - p7 = T.allocate([112], "uint8", "global") - p8 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([128], "uint8") + p2 = 
T.decl_buffer([112], "uint8") + p3 = T.decl_buffer([112], "uint8") + p4 = T.decl_buffer([32], "uint8") + p5 = T.decl_buffer([32], "uint8") + p6 = T.decl_buffer([32], "uint8") + p7 = T.decl_buffer([112], "uint8") + p8 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) @@ -166,11 +166,11 @@ def main() -> None: class AllOperatorsWithoutWeights: @T.prim_func def main() -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([36], "int8") buffer2 = T.buffer_decl([9], "int8") # body - p1 = T.allocate([96], "int8", "global") + p1 = T.decl_buffer([96], "int8") T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -188,19 +188,19 @@ def test_all_operators_without_weights(max_copy_movements): class OperatorsWithAndWithoutWeights: @T.prim_func def main() -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([97156], "int8") buffer2 = T.buffer_decl([80], "uint8") buffer3 = 
T.buffer_decl([64], "uint8") buffer4 = T.buffer_decl([96], "uint8") buffer5 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([390336], "int8", "global") - p2 = T.allocate([80], "uint8", "global") - p3 = T.allocate([64], "uint8", "global") - p4 = T.allocate([390336], "int8", "global") - p5 = T.allocate([96], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([390336], "int8") + p2 = T.decl_buffer([80], "uint8") + p3 = T.decl_buffer([64], "uint8") + p4 = T.decl_buffer([390336], "int8") + p5 = T.decl_buffer([96], "uint8") + p6 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) @@ -230,12 +230,12 @@ def main() -> None: buffer4 = T.buffer_decl([96], "uint8") buffer5 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([390336], "int8", "global") - p2 = T.allocate([80], "uint8", "global") - p3 = T.allocate([64], "uint8", "global") - p4 = T.allocate([390336], "int8", "global") - p5 = T.allocate([96], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([390336], "int8") + p2 = T.decl_buffer([80], "uint8") + p3 = T.decl_buffer([64], "uint8") + p4 = T.decl_buffer([390336], "int8") + p5 = T.decl_buffer([96], "uint8") + p6 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 
114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -256,19 +256,19 @@ def test_operators_with_and_without_weights_max_copy_movements_2(): class ReferenceModule: @T.prim_func def main() -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([97156], "int8") buffer2 = T.buffer_decl([80], "uint8") buffer3 = T.buffer_decl([64], "uint8") buffer4 = T.buffer_decl([96], "uint8") buffer5 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([390336], "int8", "global") - p2 = T.allocate([80], "uint8", "global") - p3 = T.allocate([64], "uint8", "global") - p4 = T.allocate([390336], "int8", "global") - p5 = T.allocate([96], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([390336], "int8") + p2 = T.decl_buffer([80], "uint8") + p3 = T.decl_buffer([64], "uint8") + p4 = T.decl_buffer([390336], "int8") + p5 = T.decl_buffer([96], "uint8") + p6 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) @@ -288,7 +288,7 @@ def main() -> None: class CopyToBufferWithLocalScope: @T.prim_func def main() -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([64], "uint8") buffer2 = T.buffer_decl([48], "uint8") buffer3 = T.buffer_decl([48], "uint8") @@ -298,13 +298,13 @@ def main() -> None: buffer7 = T.buffer_decl([256], "uint8") buffer8 = T.buffer_decl([64], "uint8") # body - p1 = 
T.allocate([48], "uint8", "global") - p2 = T.allocate([48], "uint8", "global") - p3 = T.allocate([256], "int8", "local") - p4 = T.allocate([256], "int8", "global") - p5 = T.allocate([16], "uint8", "global") - p6 = T.allocate([48], "uint8", "global") - p7 = T.allocate([256], "int8", "local") + p1 = T.decl_buffer([48], "uint8") + p2 = T.decl_buffer([48], "uint8") + p3 = T.decl_buffer([256], "int8", scope="local") + p4 = T.decl_buffer([256], "int8") + p5 = T.decl_buffer([16], "uint8") + p6 = T.decl_buffer([48], "uint8") + p7 = T.decl_buffer([256], "int8", scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) @@ -339,13 +339,13 @@ def main() -> None: buffer7 = T.buffer_decl([256], "uint8") buffer8 = T.buffer_decl([64], "uint8") # body - p1 = T.allocate([48], "uint8", "global") - p2 = T.allocate([48], "uint8", "global") - p3 = T.allocate([256], "int8", "local") - p4 = T.allocate([256], "int8", "global") - p5 = T.allocate([16], "uint8", "global") - p6 = T.allocate([48], "uint8", "global") - p7 = T.allocate([256], "int8", "local") + p1 = T.decl_buffer([48], "uint8") + p2 = T.decl_buffer([48], "uint8") + p3 = T.decl_buffer([256], "int8", scope="local") + p4 = T.decl_buffer([256], "int8") + p5 = T.decl_buffer([16], "uint8") + p6 = T.decl_buffer([48], "uint8") + p7 = T.decl_buffer([256], "int8", scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) @@ -412,12 +412,12 @@ def main() -> None: buffer4 = T.buffer_decl([96], "uint8") buffer5 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([390336], "int8", "global") - p2 = T.allocate([80], "uint8", "global") - p3 = 
T.allocate([64], "uint8", "global") - p4 = T.allocate([390336], "int8", "global") - p5 = T.allocate([96], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([390336], "int8") + p2 = T.decl_buffer([80], "uint8") + p3 = T.decl_buffer([64], "uint8") + p4 = T.decl_buffer([390336], "int8") + p5 = T.decl_buffer([96], "uint8") + p6 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -438,19 +438,19 @@ def test_pass_context_option_max_copy_movements(): class ReferenceModule: @T.prim_func def main() -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([97156], "int8") buffer2 = T.buffer_decl([80], "uint8") buffer3 = T.buffer_decl([64], "uint8") buffer4 = T.buffer_decl([96], "uint8") buffer5 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([390336], "int8", "global") - p2 = T.allocate([80], "uint8", "global") - p3 = T.allocate([64], "uint8", "global") - p4 = T.allocate([390336], "int8", "global") - p5 = T.allocate([96], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") + p1 = T.decl_buffer([390336], "int8") + p2 = T.decl_buffer([80], "uint8") + p3 = T.decl_buffer([64], "uint8") + p4 = T.decl_buffer([390336], "int8") + p5 = T.decl_buffer([96], "uint8") + p6 = T.decl_buffer([32], "uint8") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", 
buffer3[0], 64, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) @@ -487,15 +487,15 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208 nn_4 = T.var("int32") nn_5 = T.var("int32") # body - placeholder_d_global = T.allocate([208], "uint8", "global") - placeholder_d_global_1 = T.allocate([112], "uint8", "global") - placeholder_d_global_2 = T.allocate([96], "uint8", "global") - placeholder_d_global_3 = T.allocate([112], "uint8", "global") - ethosu_write_1 = T.allocate([195168], "int8", "global") - ethosu_write_2 = T.allocate([184800], "int8", "global") - ethosu_write_3 = T.allocate([174688], "int8", "global") - ethosu_write_4 = T.allocate([174688], "int8", "global") - ethosu_write_5 = T.allocate([174688], "int8", "global") + placeholder_d_global = T.decl_buffer([208], "uint8") + placeholder_d_global_1 = T.decl_buffer([112], "uint8") + placeholder_d_global_2 = T.decl_buffer([96], "uint8") + placeholder_d_global_3 = T.decl_buffer([112], "uint8") + ethosu_write_1 = T.decl_buffer([195168], "int8") + ethosu_write_2 = T.decl_buffer([184800], "int8") + ethosu_write_3 = T.decl_buffer([174688], "int8") + ethosu_write_4 = T.decl_buffer([174688], "int8") + ethosu_write_5 = T.decl_buffer([174688], "int8") with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792): T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle")) with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250): @@ -535,15 +535,15 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208 nn_4 = T.var("int32") nn_5 = T.var("int32") # body - placeholder_d_global = T.allocate([208], "uint8", "global") - placeholder_d_global_1 = T.allocate([112], "uint8", "global") - placeholder_d_global_2 = T.allocate([96], "uint8", "global") - placeholder_d_global_3 = T.allocate([112], 
"uint8", "global") - ethosu_write_1 = T.allocate([195168], "int8", "global") - ethosu_write_2 = T.allocate([184800], "int8", "global") - ethosu_write_3 = T.allocate([174688], "int8", "global") - ethosu_write_4 = T.allocate([174688], "int8", "global") - ethosu_write_5 = T.allocate([174688], "int8", "global") + placeholder_d_global = T.decl_buffer([208], "uint8") + placeholder_d_global_1 = T.decl_buffer([112], "uint8") + placeholder_d_global_2 = T.decl_buffer([96], "uint8") + placeholder_d_global_3 = T.decl_buffer([112], "uint8") + ethosu_write_1 = T.decl_buffer([195168], "int8") + ethosu_write_2 = T.decl_buffer([184800], "int8") + ethosu_write_3 = T.decl_buffer([174688], "int8") + ethosu_write_4 = T.decl_buffer([174688], "int8") + ethosu_write_5 = T.decl_buffer([174688], "int8") with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792): T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle")) with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024): @@ -589,17 +589,17 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208 nn_4 = T.var("int32") nn_5 = T.var("int32") # body - placeholder_d_d_global = T.allocate([208], "uint8", "global") - placeholder_d_d_global_1 = T.allocate([112], "uint8", "global") - placeholder_d_global = T.allocate([96], "uint8", "global") - ethosu_write_1 = T.allocate([195168], "int8", "global") - placeholder_local = T.allocate([256], "int8", "local") - ethosu_write_2 = T.allocate([184800], "int8", "global") - ethosu_write_3 = T.allocate([184800], "int8", "global") - ethosu_write_4 = T.allocate([184800], "int8", "global") - placeholder_d_local = T.allocate([256], "int8", "local") - ethosu_write_5 = T.allocate([184800], "int8", "global") - placeholder_d_d_local = T.allocate([256], "int8", "local") + placeholder_d_d_global = T.decl_buffer([208], 
"uint8") + placeholder_d_d_global_1 = T.decl_buffer([112], "uint8") + placeholder_d_global = T.decl_buffer([96], "uint8") + ethosu_write_1 = T.decl_buffer([195168], "int8") + placeholder_local = T.decl_buffer([256], "int8", scope="local") + ethosu_write_2 = T.decl_buffer([184800], "int8") + ethosu_write_3 = T.decl_buffer([184800], "int8") + ethosu_write_4 = T.decl_buffer([184800], "int8") + placeholder_d_local = T.decl_buffer([256], "int8", scope="local") + ethosu_write_5 = T.decl_buffer([184800], "int8") + placeholder_d_d_local = T.decl_buffer([256], "int8", scope="local") with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792): T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle")) with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668): @@ -639,17 +639,17 @@ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208 nn_4 = T.var("int32") nn_5 = T.var("int32") # body - placeholder_d_d_global = T.allocate([208], "uint8", "global") - placeholder_d_d_global_1 = T.allocate([112], "uint8", "global") - placeholder_d_global = T.allocate([96], "uint8", "global") - ethosu_write_1 = T.allocate([195168], "int8", "global") - placeholder_local = T.allocate([256], "int8", "local") - ethosu_write_2 = T.allocate([184800], "int8", "global") - ethosu_write_3 = T.allocate([184800], "int8", "global") - ethosu_write_4 = T.allocate([184800], "int8", "global") - placeholder_d_local = T.allocate([256], "int8", "local") - ethosu_write_5 = T.allocate([184800], "int8", "global") - placeholder_d_d_local = T.allocate([256], "int8", "local") + placeholder_d_d_global = T.decl_buffer([208], "uint8") + placeholder_d_d_global_1 = T.decl_buffer([112], "uint8") + placeholder_d_global = T.decl_buffer([96], "uint8") + ethosu_write_1 = T.decl_buffer([195168], "int8") + placeholder_local = T.decl_buffer([256], "int8", scope="local") + 
ethosu_write_2 = T.decl_buffer([184800], "int8") + ethosu_write_3 = T.decl_buffer([184800], "int8") + ethosu_write_4 = T.decl_buffer([184800], "int8") + placeholder_d_local = T.decl_buffer([256], "int8", scope="local") + ethosu_write_5 = T.decl_buffer([184800], "int8") + placeholder_d_d_local = T.decl_buffer([256], "int8", scope="local") with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792): T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle")) with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384): diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index fd9f373739e1..6ffbf22312ff 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -43,8 +43,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer7 = T.buffer_decl([144], "uint8") buffer8 = T.buffer_decl([32], "uint8") # body - p1 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([160], "uint8", data=p1_data) + p2_data = T.allocate([144], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([144], "uint8", data=p2_data) buffer9 = T.buffer_decl([144], "uint8", data=p1.data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 160, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 144, p2[0], dtype="handle")) @@ -69,8 +71,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_encoded_4_1 = T.buffer_decl([208], 
dtype="uint8") buffer_encoded_6_1 = T.buffer_decl([192], dtype="uint8") # body - p1 = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([208], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([208], "uint8", data=p1_data) + p2_data = T.allocate([192], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([192], "uint8", data=p2_data) p3 = T.buffer_decl([192], dtype="uint8", data=p1.data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 192, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 192, p2[0], dtype="handle")) @@ -149,8 +153,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([384], "uint8") # body - p1 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([384], "uint8", data=p1_data) + p2_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([384], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 384, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p1[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, 
"TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -167,8 +173,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # buffer definition placeholder_encoded_1 = T.buffer_decl([464], "uint8") # body - p1 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([464], "uint8", data=p1_data) + p2_data = T.allocate([464], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([464], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 464, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -246,7 +254,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_2 = T.buffer_decl([160], "uint8") buffer_3 = T.buffer_decl([80], "uint8") # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, 
T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 160, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -264,7 +273,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), placeholder_encoded_2 = T.buffer_decl([208], dtype="uint8") placeholder_encoded_3 = T.buffer_decl([96], dtype="uint8") # body - ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_2 = T.buffer_decl([4096], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_2[0], 112, placeholder_encoded_2[112], 96, 12, placeholder_encoded_3[0], 48, placeholder_encoded_3[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ 
-340,9 +350,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(112,) buffer10 = T.buffer_decl([160], "uint8") buffer11 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([112], "uint8", data=p1_data) + p3_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.buffer_decl([4096], "int8", data=p3_data) + p2_data = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([112], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 112, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p2[0], dtype="handle")) @@ -369,9 +382,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer_encoded: T.Buffer[(128,) buffer4 = T.buffer_decl([608], dtype="uint8") buffer5 = T.buffer_decl([160], dtype="uint8") buffer6 = T.buffer_decl([2048], dtype="int8") - p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - p3 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([128], "uint8", "global", 
annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([128], "uint8", data=p1_data) + p2_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([4096], "int8", data=p2_data) + p3_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.buffer_decl([128], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer4[0], 304, buffer4[304], 304, 12, buffer5[0], 80, buffer5[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p3[0], dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_hoist_allocates.py b/tests/python/contrib/test_ethosu/test_hoist_allocates.py index b54b92950180..6c6d51fa06b9 100644 --- a/tests/python/contrib/test_ethosu/test_hoist_allocates.py +++ b/tests/python/contrib/test_ethosu/test_hoist_allocates.py @@ -116,15 +116,20 @@ def main(placeholder: T.Buffer[(3402,), "int8"], placeholder_encoded: T.Buffer[( T.preflattened_buffer(placeholder_encoded_3, [3, 10], dtype="uint8") T.preflattened_buffer(ethosu_write, [1, 27, 42, 3], dtype="int8", data=ethosu_write.data) # body - placeholder_global = T.allocate([128], "uint8", "global") + placeholder_global_data = T.allocate([128], "uint8", "global") + placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 128, placeholder_global[0], dtype="handle")) - placeholder_d_global = T.allocate([32], "uint8", "global") + placeholder_d_global_data = T.allocate([32], "uint8", "global") + placeholder_d_global = T.buffer_decl([32], "uint8", 
data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) - ethosu_write_2 = T.allocate([18144], "int8", "global") + ethosu_write_2_data = T.allocate([18144], "int8", "global") + ethosu_write_2 = T.buffer_decl([18144], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, placeholder[0], 0, 0, 0, T.float32(0.0039215646684169769), -128, "NHWC", 126, 3, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, 2, 3, 1, 1, 1, 2, placeholder_global[0], 128, 0, placeholder_d_global[0], 32, 2, 0, 2, 1, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) - placeholder_d_global_1 = T.allocate([128], "uint8", "global") + placeholder_d_global_1_data = T.allocate([128], "uint8", "global") + placeholder_d_global_1 = T.buffer_decl([128], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 128, placeholder_d_global_1[0], dtype="handle")) - placeholder_d_global_2 = T.allocate([32], "uint8", "global") + placeholder_d_global_2_data = T.allocate([32], "uint8", "global") + placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 27, 42, 3, 27, 0, 42, ethosu_write_2[0], 0, 0, 0, T.float32(0.031308155506849289), -128, "NHCWB16", 672, 16, 1, "int8", 27, 42, 3, 27, 0, 42, ethosu_write[0], 0, 0, 0, T.float32(0.23604340851306915), -128, "NHWC", 126, 3, 1, 2, 3, 1, 1, 1, 2, placeholder_d_global_1[0], 128, 0, placeholder_d_global_2[0], 32, 2, 0, 2, 1, "CLIP", -128, 127, "TFL", "NONE", dtype="handle")) # fmt: on @@ -151,14 +156,18 @@ def main(placeholder: T.Buffer[(24,), "int8"], T_concat: T.Buffer[(24,), "int8"] T.preflattened_buffer(placeholder, [1, 
2, 3, 4], dtype="int8", data=placeholder.data) T.preflattened_buffer(T_concat, [24], dtype="int8", data=T_concat.data) # body - ethosu_write = T.allocate([12], "int8", "global") + ethosu_write_data = T.allocate([12], "int8", "global") + ethosu_write = T.buffer_decl([12], "int8", data=ethosu_write_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) - ethosu_write_1 = T.allocate([12], "int8", "global") + ethosu_write_1_data = T.allocate([12], "int8", "global") + ethosu_write_1 = T.buffer_decl([12], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[12], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) - ethosu_write_2 = T.allocate([12], "int8", "global") + ethosu_write_2_data = T.allocate([12], "int8", "global") + ethosu_write_2 = T.buffer_decl([12], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, placeholder[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) - ethosu_write_3 = T.allocate([12], "int8", "global") + ethosu_write_3_data = T.allocate([12], "int8", "global") + 
ethosu_write_3 = T.buffer_decl([12], "int8", data=ethosu_write_3_data) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 3, 4, 1, 0, 3, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 3, 4, 1, 0, 3, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 12, 1, 1, 12, 0, 1, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "int8", 12, 1, 1, 12, 0, 1, T_concat[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 1, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle")) # fmt: on @@ -185,24 +194,32 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) # body - with T.allocate([128], "uint8", "global") as placeholder_global: + with T.allocate([128], "uint8", "global") as placeholder_global_data: + placeholder_global = T.buffer_decl([128], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle")) - placeholder_d_global = T.allocate([32], "uint8", "global") + placeholder_d_global_data = T.allocate([32], "uint8", "global") + placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) 
- with T.allocate([112], "uint8", "global") as placeholder_global_1: + with T.allocate([112], "uint8", "global") as placeholder_global_1_data: + placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 112, placeholder_global_1[0], dtype="handle")) - placeholder_d_global_1 = T.allocate([32], "uint8", "global") + placeholder_d_global_1_data = T.allocate([32], "uint8", "global") + placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 112, 12, placeholder_d_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - with T.allocate([112], "uint8", "global") as placeholder_global_2: + with T.allocate([112], "uint8", "global") as placeholder_global_2_data: + placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle")) - placeholder_d_global_2 = T.allocate([32], "uint8", "global") + placeholder_d_global_2_data = T.allocate([32], "uint8", "global") + placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 
112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - placeholder_global_3 = T.allocate([112], "uint8", "global") + placeholder_global_3_data = T.allocate([112], "uint8", "global") + placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle")) - placeholder_d_global_3 = T.allocate([32], "uint8", "global") + placeholder_d_global_3_data = T.allocate([32], "uint8", "global") + placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 112, 12, placeholder_d_global_3[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -227,13 +244,20 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) # body - placeholder_global = T.allocate([128], "uint8", "global") - placeholder_global_1 = T.allocate([112], "uint8", "global") - placeholder_global_2 = T.allocate([112], "uint8", "global") - placeholder_d_global = T.allocate([32], "uint8", "global") - placeholder_d_global_1 = T.allocate([32], "uint8", "global") - placeholder_d_global_2 = T.allocate([32], "uint8", "global") - placeholder_global_3 = T.allocate([112], "uint8", "global") + placeholder_global_data = T.allocate([128], "uint8", "global") + placeholder_global = T.buffer_decl([128], "uint8", 
data=placeholder_global_data) + placeholder_global_1_data = T.allocate([112], "uint8", "global") + placeholder_global_1 = T.buffer_decl([112], "uint8", data=placeholder_global_1_data) + placeholder_global_2_data = T.allocate([112], "uint8", "global") + placeholder_global_2 = T.buffer_decl([112], "uint8", data=placeholder_global_2_data) + placeholder_d_global_data = T.allocate([32], "uint8", "global") + placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) + placeholder_d_global_1_data = T.allocate([32], "uint8", "global") + placeholder_d_global_1 = T.buffer_decl([32], "uint8", data=placeholder_d_global_1_data) + placeholder_d_global_2_data = T.allocate([32], "uint8", "global") + placeholder_d_global_2 = T.buffer_decl([32], "uint8", data=placeholder_d_global_2_data) + placeholder_global_3_data = T.allocate([112], "uint8", "global") + placeholder_global_3 = T.buffer_decl([112], "uint8", data=placeholder_global_3_data) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 128, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, 12, placeholder_d_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -242,7 +266,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 112, 12, placeholder_d_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, 
"TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 112, placeholder_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) - placeholder_d_global_3 = T.allocate([32], "uint8", "global") + placeholder_d_global_3_data = T.allocate([32], "uint8", "global") + placeholder_d_global_3 = T.buffer_decl([32], "uint8", data=placeholder_d_global_3_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 112, 12, placeholder_d_global_2[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 112, placeholder_global_3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_merge_constants.py b/tests/python/contrib/test_ethosu/test_merge_constants.py index caf09abdb020..337b5c70d125 100644 --- a/tests/python/contrib/test_ethosu/test_merge_constants.py +++ b/tests/python/contrib/test_ethosu/test_merge_constants.py @@ -44,8 +44,10 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"]) buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") + p1_data = T.allocate([128], "uint8", "global") + p1 = T.buffer_decl([128], "uint8", data=p1_data) + p4_data = T.allocate([32], "uint8", "global") + p4 = T.buffer_decl([32], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], 
dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -60,7 +62,8 @@ def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p4 = T.allocate([160], "uint8", "global") + p4_data = T.allocate([160], "uint8", "global") + p4 = T.buffer_decl([160], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -86,14 +89,22 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p2 = T.allocate([112], "uint8", "global") - p3 = T.allocate([112], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") - p5 = T.allocate([32], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") - p7 = T.allocate([112], "uint8", "global") - p8 = T.allocate([3], "uint8", "global") + p1_data = T.allocate([128], "uint8", "global") + p1 = T.buffer_decl([128], "uint8", data=p1_data) + p2_data = T.allocate([112], "uint8", "global") + p2 = T.buffer_decl([112], "uint8", data=p2_data) + p3_data = T.allocate([112], "uint8", "global") + p3 = T.buffer_decl([112], "uint8", data=p3_data) + p4_data = T.allocate([32], "uint8", "global") + p4 = T.buffer_decl([32], 
"uint8", data=p4_data) + p5_data = T.allocate([32], "uint8", "global") + p5 = T.buffer_decl([32], "uint8", data=p5_data) + p6_data = T.allocate([32], "uint8", "global") + p6 = T.buffer_decl([32], "uint8", data=p6_data) + p7_data = T.allocate([112], "uint8", "global") + p7 = T.buffer_decl([112], "uint8", data=p7_data) + p8_data = T.allocate([3], "uint8", "global") + p8 = T.buffer_decl([3], "uint8", data=p8_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) @@ -117,10 +128,14 @@ def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p4 = T.allocate([160], "uint8", "global") - p7 = T.allocate([144], "uint8", "global") - p10 = T.allocate([144], "uint8", "global") - p11 = T.allocate([144], "uint8", "global") + p4_data = T.allocate([160], "uint8", "global") + p4 = T.buffer_decl([160], "uint8", data=p4_data) + p7_data = T.allocate([144], "uint8", "global") + p7 = T.buffer_decl([144], "uint8", data=p7_data) + p10_data = T.allocate([144], "uint8", "global") + p10 = T.buffer_decl([144], "uint8", data=p10_data) + p11_data = T.allocate([144], "uint8", "global") + p11 = T.buffer_decl([144], "uint8", data=p11_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 144, p7[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 128, 12, p4[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -159,13 +174,15 @@ def 
test_operators_with_and_without_weights(): class InputModule: @T.prim_func def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer0 = T.buffer_decl([390336], "int8") buffer1 = T.buffer_decl([97156], "int8") buffer6 = T.buffer_decl([390336], "int8") # body - p2 = T.allocate([80], "uint8", "global") - p3 = T.allocate([64], "uint8", "global") + p2_data = T.allocate([80], "uint8", "global") + p2 = T.buffer_decl([80], "uint8", data=p2_data) + p3_data = T.allocate([64], "uint8", "global") + p3 = T.buffer_decl([64], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) @@ -176,12 +193,13 @@ def main(buffer2: T.Buffer[(80,), "uint8"], buffer3: T.Buffer[(64,), "uint8"]) - class ReferenceModule: @T.prim_func def main(buffer2: T.Buffer[(144,), "uint8"]) -> None: - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer0 = T.buffer_decl([390336], "int8") buffer1 = T.buffer_decl([97156], "int8") buffer6 = T.buffer_decl([390336], "int8") # body - p3 = T.allocate([144], "uint8", "global") + p3_data = T.allocate([144], "uint8", "global") + p3 = T.buffer_decl([144], "uint8", data=p3_data) T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 
0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 144, p3[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, buffer0[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, buffer6[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p3[0], 80, 0, p3[80], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -203,8 +221,8 @@ def test_copy_to_buffer_with_local_scope(): @tvm.script.ir_module class InputModule: @T.prim_func - def main(buffer1: T.Buffer[(64,), "uint8"], - buffer2: T.Buffer[(48,), "uint8"], + def main(buffer1: T.Buffer[(64,), "uint8"], + buffer2: T.Buffer[(48,), "uint8"], buffer3: T.Buffer[(256,), "uint8"], buffer4: T.Buffer[(256,), "uint8"], buffer5: T.Buffer[(16,), "uint8"], @@ -215,12 +233,18 @@ def main(buffer1: T.Buffer[(64,), "uint8"], ) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # body - p1 = T.allocate([48], "uint8", "global") - p2 = T.allocate([48], "uint8", "global") - p3 = T.allocate([256], "int8", "local") - p5 = T.allocate([16], "uint8", "global") - p6 = T.allocate([48], "uint8", "global") - p7 = T.allocate([256], "int8", "local") + p1_data = T.allocate([48], "uint8", "global") + p1 = T.buffer_decl([48], "uint8", data=p1_data) + p2_data = T.allocate([48], "uint8", "global") + p2 = T.buffer_decl([48], "uint8", data=p2_data) + p3_data = T.allocate([256], "int8", "local") + p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local") + p5_data = T.allocate([16], "uint8", "global") + p5 = T.buffer_decl([16], "uint8", data=p5_data) + p6_data = T.allocate([48], "uint8", "global") + p6 = T.buffer_decl([48], "uint8", 
data=p6_data) + p7_data = T.allocate([256], "int8", "local") + p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local @@ -234,8 +258,8 @@ def main(buffer1: T.Buffer[(64,), "uint8"], @tvm.script.ir_module class ReferenceModule: @T.prim_func - def main(buffer1: T.Buffer[(64,), "uint8"], - buffer2: T.Buffer[(96,), "uint8"], + def main(buffer1: T.Buffer[(64,), "uint8"], + buffer2: T.Buffer[(96,), "uint8"], buffer4: T.Buffer[(256,), "uint8"], buffer5: T.Buffer[(64,), "uint8"], buffer7: T.Buffer[(256,), "uint8"], @@ -244,10 +268,14 @@ def main(buffer1: T.Buffer[(64,), "uint8"], ) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # body - p1 = T.allocate([96], "uint8", "global") - p2 = T.allocate([64], "uint8", "global") - p3 = T.allocate([256], "int8", "local") - p7 = T.allocate([256], "int8", "local") + p1_data = T.allocate([96], "uint8", "global") + p1 = T.buffer_decl([96], "uint8", data=p1_data) + p2_data = T.allocate([64], "uint8", "global") + p2 = T.buffer_decl([64], "uint8", data=p2_data) + p3_data = T.allocate([256], "int8", "local") + p3 = T.buffer_decl([256], "int8", data=p3_data, scope="local") + p7_data = T.allocate([256], "int8", "local") + p7 = T.buffer_decl([256], "int8", data=p7_data, scope="local") T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 64, p2[0], dtype="handle")) @@ -287,10 +315,11 @@ def main() -> None: placeholder = T.buffer_decl([20], "int8") ethosu_write = T.buffer_decl([16], "int8") # body - ethosu_write_4 = T.allocate([16], "int8", "global") + ethosu_write_4_data = 
T.allocate([16], "int8", "global") + ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data) T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - + @tvm.script.ir_module class ReferenceModule: @T.prim_func @@ -300,7 +329,8 @@ def main() -> None: placeholder = T.buffer_decl([20], "int8") ethosu_write = T.buffer_decl([16], "int8") # body - ethosu_write_4 = T.allocate([16], "int8", "global") + ethosu_write_4_data = T.allocate([16], "int8", "global") + ethosu_write_4 = T.buffer_decl([16], "int8", data=ethosu_write_4_data) T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 1, 4, 4, 1, 0, 4, placeholder[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "int8", 1, 4, 1, 1, 0, 4, placeholder[16], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 1, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(0.00783747), -128, "NHWC", 1, 4, 1, "MAX", 0, "CLIP", -128, 127, "TFL", 1, 4, 4, dtype="handle")) T.evaluate(T.call_extern("ethosu_identity", "int8", 1, 4, 4, 1, 0, 4, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "int8", 1, 4, 4, 1, 0, 4, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1, 4, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) # fmt: on @@ -324,8 +354,10 @@ def main(buffer2: T.Buffer[(128,), 
"uint8"], buffer3: T.Buffer[(32,), "uint8"]) buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p4 = T.allocate([32], "uint8", "global") + p1_data = T.allocate([128], "uint8", "global") + p1 = T.buffer_decl([128], "uint8", data=p1_data) + p4_data = T.allocate([32], "uint8", "global") + p4 = T.buffer_decl([32], "uint8", data=p4_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -343,7 +375,8 @@ def main(buffer2: T.Buffer[(160,), "uint8"]) -> None: buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p5 = T.allocate([160], "uint8", "global") + p5_data = T.allocate([160], "uint8", "global") + p5 = T.buffer_decl([160], "uint8", data=p5_data) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p5[0], 128, 12, p5[128], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p5[0], dtype="handle")) @@ -373,8 +406,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", 
data=ethosu_write.data) # body - p1 = T.allocate([368], "uint8", "global") - p2 = T.allocate([96], "uint8", "global") + p1_data = T.allocate([368], "uint8", "global") + p1 = T.buffer_decl([368], "uint8", data=p1_data) + p2_data = T.allocate([96], "uint8", "global") + p2 = T.buffer_decl([96], "uint8", data=p2_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -388,7 +423,8 @@ def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(464,), "uint # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # body - p1 = T.allocate([464], "uint8", "global") + p1_data = T.allocate([464], "uint8", "global") + p1 = T.buffer_decl([464], "uint8", data=p1_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 464, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p1[368], 48, p1[416], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -428,14 +464,22 @@ def main(buffer2: T.Buffer[(128,), "uint8"], buffer3: T.Buffer[(32,), "uint8"], buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p1 = T.allocate([128], "uint8", "global") - p2 = T.allocate([112], "uint8", "global") - p3 = T.allocate([112], "uint8", 
"global") - p4 = T.allocate([32], "uint8", "global") - p5 = T.allocate([32], "uint8", "global") - p6 = T.allocate([32], "uint8", "global") - p7 = T.allocate([112], "uint8", "global") - p8 = T.allocate([3], "uint8", "global") + p1_data = T.allocate([128], "uint8", "global") + p1 = T.buffer_decl([128], "uint8", data=p1_data) + p2_data = T.allocate([112], "uint8", "global") + p2 = T.buffer_decl([112], "uint8", data=p2_data) + p3_data = T.allocate([112], "uint8", "global") + p3 = T.buffer_decl([112], "uint8", data=p3_data) + p4_data = T.allocate([32], "uint8", "global") + p4 = T.buffer_decl([32], "uint8", data=p4_data) + p5_data = T.allocate([32], "uint8", "global") + p5 = T.buffer_decl([32], "uint8", data=p5_data) + p6_data = T.allocate([32], "uint8", "global") + p6 = T.buffer_decl([32], "uint8", data=p6_data) + p7_data = T.allocate([112], "uint8", "global") + p7 = T.buffer_decl([112], "uint8", data=p7_data) + p8_data = T.allocate([3], "uint8", "global") + p8 = T.buffer_decl([3], "uint8", data=p8_data) with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 100): T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) with T.attr(T.iter_var(v1b, None, "DataPar", ""), "pragma_compute_cycles_hint", 101): @@ -479,10 +523,14 @@ def main(buffer2: T.Buffer[(160,), "uint8"], buffer4: T.Buffer[(144,), "uint8"], buffer1 = T.buffer_decl([8192], "int8") buffer10 = T.buffer_decl([2048], "int8") # body - p4 = T.allocate([160], "uint8", "global") - p7 = T.allocate([144], "uint8", "global") - p10 = T.allocate([144], "uint8", "global") - p11 = T.allocate([144], "uint8", "global") + p4_data = T.allocate([160], "uint8", "global") + p4 = T.buffer_decl([160], "uint8", data=p4_data) + p7_data = T.allocate([144], "uint8", "global") + p7 = T.buffer_decl([144], "uint8", data=p7_data) + p10_data = T.allocate([144], "uint8", "global") + p10 = T.buffer_decl([144], "uint8", data=p10_data) + p11_data = T.allocate([144], "uint8", "global") + p11 = 
T.buffer_decl([144], "uint8", data=p11_data) with T.attr(T.iter_var(v1a, None, "DataPar", ""), "pragma_compute_cycles_hint", 201): T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 160, p4[0], dtype="handle")) with T.attr(T.iter_var(v2a, None, "DataPar", ""), "pragma_compute_cycles_hint", 205): diff --git a/tests/python/contrib/test_ethosu/test_remove_concatenates.py b/tests/python/contrib/test_ethosu/test_remove_concatenates.py index d2c759a0ae4d..e6414c24d4a3 100644 --- a/tests/python/contrib/test_ethosu/test_remove_concatenates.py +++ b/tests/python/contrib/test_ethosu/test_remove_concatenates.py @@ -42,7 +42,8 @@ def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,) buffer_6 = T.buffer_decl([2992], "uint8") buffer_7 = T.buffer_decl([160], "uint8") # body - T_concat_1 = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True}) + T_concat_1_data = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True}) + T_concat_1 = T.buffer_decl([2816], "int8", data=T_concat_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat[352], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_3[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 12, 16, 8, 0, 12, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 
192, 16, 1, "int8", 8, 12, 16, 8, 0, 12, T_concat_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer_4[0], 2992, T.int8(-1), T.int8(-1), 12, buffer_5[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_replace_conv2d.py b/tests/python/contrib/test_ethosu/test_replace_conv2d.py index 46a3c5a15bf5..ae46057369e0 100644 --- a/tests/python/contrib/test_ethosu/test_replace_conv2d.py +++ b/tests/python/contrib/test_ethosu/test_replace_conv2d.py @@ -374,7 +374,8 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512, buffer_2 = T.buffer_decl([320], "uint8") buffer_3 = T.buffer_decl([160], "uint8") # body - ethosu_write_2 = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2_data = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2 = T.buffer_decl([1024], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, buffer[0], 304, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, placeholder_5[12], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 
0, 4, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, buffer_3[0], 160, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -393,7 +394,8 @@ def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512, buffer_2 = T.buffer_decl([1312], "uint8") buffer_3 = T.buffer_decl([2608], "uint8") # body - ethosu_write_2 = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2_data = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2 = T.buffer_decl([1536], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, T.int8(-1), T.int8(-1), 12, buffer[0], 80, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[48], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -412,7 +414,8 @@ def main(placeholder_5: T.Buffer[(768,), 
"int8"], ethosu_write_1: T.Buffer[(640, buffer_2 = T.buffer_decl([320], "uint8") buffer_3 = T.buffer_decl([880], "uint8") # body - ethosu_write_2 = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2_data = T.allocate([2560], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2 = T.buffer_decl([2560], "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 3, 8, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 32, 8, 0, 8, ethosu_write_2[512], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 8, 4, 8, 8, 0, 4, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 8, 1, 2, 3, 2, 1, 2, 1, buffer[0], 1744, T.int8(-1), T.int8(-1), 12, buffer_1[0], 80, T.int8(-1), T.int8(-1), 2, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 12, 16, 3, 12, 0, 16, placeholder_5[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 48, 3, 1, "int8", 10, 8, 32, 10, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 2, 3, 2, 1, 2, 1, buffer_3[0], 880, T.int8(-1), T.int8(-1), 12, buffer_2[0], 320, T.int8(-1), T.int8(-1), 0, 1, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -433,7 +436,8 @@ def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(204 buffer_2 = T.buffer_decl([272], "uint8") buffer_3 = T.buffer_decl([11040], "uint8") # body - ethosu_write_2 = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True}) + ethosu_write_2_data = T.allocate([2304], "int8", "global", 
annotations={"disable_lower_builtin": True}) + ethosu_write_2 = T.buffer_decl((2304,), "int8", data=ethosu_write_2_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[256], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -452,7 +456,8 @@ def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,), buffer_2 = T.buffer_decl([304], "uint8") buffer_3 = T.buffer_decl([80], "uint8") # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, 
ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 32, 8, 16, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 304, T.int8(-1), T.int8(-1), 12, buffer_3[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, T.int8(-1), T.int8(-1), 12, buffer_1[0], 320, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle")) @@ -471,7 +476,8 @@ def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,) buffer_2 = T.buffer_decl([11040], "uint8") buffer_3 = T.buffer_decl([272], "uint8") # body - ethosu_write_1 = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1_data = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1 = T.buffer_decl([12288], "int8", data=ethosu_write_1_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 
35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 768, 16, 256, "int8", 32, 32, 26, 32, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 1024, 16, 512, 3, 3, 1, 1, 1, 1, buffer_2[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_3[0], 272, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle")) __tvm_meta__ = None diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 6b97b38d80e6..8c7ff35272ef 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -36,7 +36,8 @@ def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(204 T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer_1 = T.buffer_decl([384], "uint8") # body - placeholder_global = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_global_data = T.allocate([384], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_global = T.buffer_decl([384], "uint8", data=placeholder_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 384, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_global[304], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -78,8 +79,10 @@ def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(409 buffer = T.buffer_decl([528], "uint8") buffer_2 = T.buffer_decl([336], "uint8") # body - placeholder_d_global = T.allocate([528], "uint8", 
"global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_1 = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global_data = T.allocate([528], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global = T.buffer_decl([528], "uint8", data=placeholder_d_global_data) + placeholder_d_global_1_data = T.allocate([336], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global_1 = T.buffer_decl([336], "uint8", data=placeholder_d_global_1_data) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 528, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 336, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_d_global[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global[416], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index ba050de2b473..254abab644a2 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -184,10 +184,14 @@ def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264 T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) buffer1 = T.buffer_decl([2848], "uint8") buffer3 = T.buffer_decl([976], "uint8") - p1 = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2 = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True}) - p5 = T.allocate([75264], "int8", "global", 
annotations={"disable_lower_builtin":True}) - p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p1_data = T.allocate([2848], "uint8", "global", annotations={"disable_lower_builtin":True}) + p1 = T.buffer_decl([2848], "uint8", data=p1_data) + p2_data = T.allocate([976], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.buffer_decl([976], "uint8", data=p2_data) + p5_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.buffer_decl([75264], "int8", data=p5_data) + p6_data = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p6 = T.buffer_decl([75264], "int8", data=p6_data) T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2848, p1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 976, p2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p1[2608], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py index e1a0e143281b..f8a84aa08367 100644 --- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py +++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py @@ -56,8 +56,8 @@ def main(placeholder_6: T.Buffer[(192,), "int8"], ethosu_conv2d_1: T.Buffer[(512 placeholder_8 = T.buffer_decl([1], "uint8") placeholder_5 = T.buffer_decl([1], "uint8") # body - ethosu_conv2d_2 = T.allocate([1024], "uint8", "global") - ethosu_conv2d_3 = T.allocate([2048], "uint8", "global") + ethosu_conv2d_2 = T.decl_buffer([1024], "uint8") + ethosu_conv2d_3 = T.decl_buffer([2048], "uint8") 
T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, placeholder_6[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, placeholder_7[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_8[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="uint8")) T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, ethosu_conv2d_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_9[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_5[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="uint8")) T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, placeholder_6[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, ethosu_conv2d_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, placeholder_7[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_8[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="uint8")) @@ -76,8 +76,8 @@ def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_conv2d_1: T.Buffer[(20 placeholder_5 = T.buffer_decl([1], "int32") placeholder_4 = T.buffer_decl([1], "uint8") # body - placeholder_global = T.allocate([256], "uint8", "global") - placeholder_d_global = T.allocate([8], "int32", "global") + placeholder_global = T.decl_buffer([256], "uint8") + placeholder_d_global = T.decl_buffer([8], "int32") T.evaluate(T.call_extern("ethosu_copy", placeholder_4[0], 256, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", placeholder_5[0], 8, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 
32, 1, "uint8", 16, 16, 8, 16, 0, 16, ethosu_conv2d_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 0, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 0, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "CLIP", 0, 255, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -110,8 +110,10 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_6.name: buffer_6, buffer_7.name: buffer_7}}) # body - placeholder_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global_data = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global = T.decl_buffer([128], "uint8", data=placeholder_global_data) + placeholder_d_global_data = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global = T.decl_buffer([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, placeholder_d_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 128, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) @@ -158,9 +160,12 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), buffer_8.name: buffer_8, buffer_9.name: buffer_9}}) # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global = T.allocate([80], "uint8", 
"global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1_data = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + ethosu_write_1 = T.buffer_decl([4096], "int8", data=ethosu_write_1_data) + placeholder_global_data = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global = T.buffer_decl([80], "uint8", data=placeholder_global_data) + placeholder_d_global_data = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global = T.buffer_decl([32], "uint8", data=placeholder_d_global_data) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle")) @@ -678,10 +683,10 @@ def main(placeholder_4: T.Buffer[(2048,), "int8"], ethosu_write_1: T.Buffer[(16, buffer_1.name: buffer_1, buffer_2.name: buffer_2}}) # body - placeholder_global = T.allocate([272], "uint8", "global") - placeholder_d_global = T.allocate([160], "uint8", "global") - ethosu_write_2 = T.allocate([16], "int16", "global") - placeholder_d_global_1 = T.allocate([1], "int16", "global") + placeholder_global = T.decl_buffer([272], "uint8") + placeholder_d_global = T.decl_buffer([160], "uint8") + ethosu_write_2 = T.decl_buffer([16], "int16") + placeholder_d_global_1 = T.decl_buffer([1], "int16") T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 272, 
placeholder_global[0], dtype="uint8")) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 160, placeholder_d_global[0], dtype="uint8")) T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8, 16, 16, 8, 0, 16, placeholder_4[0], 0, 0, 0, T.float32(0.0039215548895299435), -128, "NHWC", 256, 16, 1, "int16", 1, 1, 16, 1, 0, 1, ethosu_write_2[0], 0, 0, 0, T.float32(0.0023205536417663097), -128, "NHWC", 1, 1, 1, 16, 8, 1, 1, 1, 1, placeholder_global[0], 272, 0, placeholder_d_global[0], 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="int16")) diff --git a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py index 0b1e0f402b9d..e7632561c05c 100644 --- a/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py +++ b/tests/python/unittest/test_meta_schedule_postproc_verify_gpu_code.py @@ -63,9 +63,9 @@ def main(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [14*14*512*256], dtype="float32") # body T.launch_thread(blockIdx_z, 196) - B_local = T.allocate([64], "float32", "local") - Apad_shared = T.allocate([512], "float32", "shared") - Apad_shared_local = T.allocate([8], "float32", "local") + B_local = T.decl_buffer([64], "float32", scope="local") + Apad_shared = T.decl_buffer([512], "float32", scope="shared") + Apad_shared_local = T.decl_buffer([8], "float32", scope="local") T.launch_thread(blockIdx_y, 8) T.launch_thread(blockIdx_x, 4) T.launch_thread(threadIdx_y, 8) @@ -105,9 +105,9 @@ def main(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [14*14*512*256], dtype="float32") # body T.launch_thread(blockIdx_z, 196) - B_local = T.allocate([6400000], "float32", "local") - Apad_shared = T.allocate([512], "float32", "shared") - Apad_shared_local = T.allocate([8], "float32", "local") + B_local = T.decl_buffer([6400000], "float32", scope="local") + Apad_shared = T.decl_buffer([512], "float32", scope="shared") + Apad_shared_local = 
T.decl_buffer([8], "float32", scope="local") T.launch_thread(blockIdx_y, 8) T.launch_thread(blockIdx_x, 4) T.launch_thread(threadIdx_y, 8) @@ -151,9 +151,9 @@ def main(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [14*14*512*256], dtype="float32") # body T.launch_thread(blockIdx_z, 196) - B_local = T.allocate([64], "float32", "local") - Apad_shared = T.allocate([512000], "float32", "shared") - Apad_shared_local = T.allocate([8], "float32", "local") + B_local = T.decl_buffer([64], "float32", scope="local") + Apad_shared = T.decl_buffer([512000], "float32", scope="shared") + Apad_shared_local = T.decl_buffer([8], "float32", scope="local") T.launch_thread(blockIdx_y, 8) T.launch_thread(blockIdx_x, 4) T.launch_thread(threadIdx_y, 8) @@ -197,9 +197,9 @@ def main(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [14*14*512*256], dtype="float32") # body T.launch_thread(blockIdx_z, 196) - B_local = T.allocate([64], "float32", "local") - Apad_shared = T.allocate([512], "float32", "shared") - Apad_shared_local = T.allocate([8], "float32", "local") + B_local = T.decl_buffer([64], "float32", scope="local") + Apad_shared = T.decl_buffer([512], "float32", scope="shared") + Apad_shared_local = T.decl_buffer([8], "float32", scope="local") T.launch_thread(blockIdx_y, 8) T.launch_thread(blockIdx_x, 4) T.launch_thread(threadIdx_y, 8) diff --git a/tests/python/unittest/test_tir_analysis_calculate_workspace.py b/tests/python/unittest/test_tir_analysis_calculate_workspace.py index 1d78458b930d..12c892a04b07 100644 --- a/tests/python/unittest/test_tir_analysis_calculate_workspace.py +++ b/tests/python/unittest/test_tir_analysis_calculate_workspace.py @@ -31,8 +31,8 @@ def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.hand placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body 
- PaddedInput_22 = T.allocate([131072], "int16", "global") - DepthwiseConv2d_9 = T.allocate([100352], "int32", "global") + PaddedInput_22 = T.decl_buffer([131072], "int16") + DepthwiseConv2d_9 = T.decl_buffer([100352], "int32") for i1_29, i2_39, i3_40 in T.grid(16, 16, 512): PaddedInput_22[(((i1_29*8192) + (i2_39*512)) + i3_40)] = T.if_then_else(((((1 <= i1_29) and (i1_29 < 15)) and (1 <= i2_39)) and (i2_39 < 15)), placeholder_147[((((i1_29*7168) + (i2_39*512)) + i3_40) - 7680)], T.int16(0), dtype="int16") for i_9, j_9, c_9 in T.grid(14, 14, 512): @@ -63,25 +63,25 @@ def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handl T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=64, offset_factor=1) sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8]) # body - PaddedInput_25 = T.allocate([131072], "int16", "global") + PaddedInput_25 = T.decl_buffer([131072], "int16") for i1_35, i2_46, i3_47 in T.grid(16, 16, 512): PaddedInput_25[(((i1_35*8192) + (i2_46*512)) + i3_47)] = T.if_then_else(((((1 <= i1_35) and (i1_35 < 15)) and (1 <= i2_46)) and (i2_46 < 15)), placeholder_165[((((i1_35*7168) + (i2_46*512)) + i3_47) - 7680)], T.int16(0), dtype="int16") - T_add_11 = T.allocate([100352], "int32", "global") - with T.allocate([100352], "int32", "global") as DepthwiseConv2d_11: + T_add_11 = T.decl_buffer([100352], "int32") + with T.decl_buffer([100352], "int32") as DepthwiseConv2d_11: for i_11, j_11, c_11 in T.grid(14, 14, 512): DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = 0 for di_11, dj_11 in T.grid(3, 3): DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = (DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] + (PaddedInput_25[(((((i_11*8192) + (di_11*8192)) + (j_11*512)) + (dj_11*512)) + c_11)].astype("int32")*placeholder_166[(((di_11*1536) + (dj_11*512)) + c_11)].astype("int32"))) for ax1_44, ax2_45, ax3_47 in T.grid(14, 14, 512): T_add_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] = 
(DepthwiseConv2d_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] + placeholder_167[ax3_47]) - compute_22 = T.allocate([100352], "int32", "global") - with T.allocate([100352], "int32", "global") as T_cast_78: + compute_22 = T.decl_buffer([100352], "int32") + with T.decl_buffer([100352], "int32") as T_cast_78: for ax1_45, ax2_46, ax3_48 in T.grid(14, 14, 512): T_cast_78[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)] = T_add_11[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)] for i1_36, i2_47, i3_48 in T.grid(14, 14, 512): compute_22[(((i1_36*7168) + (i2_47*512)) + i3_48)] = T.q_multiply_shift(T_cast_78[(((i1_36*7168) + (i2_47*512)) + i3_48)], 1948805937, 31, -5, dtype="int32") - T_cast_79 = T.allocate([100352], "uint8", "global") - with T.allocate([100352], "int32", "global") as compute_23: + T_cast_79 = T.decl_buffer([100352], "uint8") + with T.decl_buffer([100352], "int32") as compute_23: for i1_37, i2_48, i3_49 in T.grid(14, 14, 512): compute_23[(((i1_37*7168) + (i2_48*512)) + i3_49)] = T.max(T.max(compute_22[(((i1_37*7168) + (i2_48*512)) + i3_49)], 255), 0) for ax1_46, ax2_47, ax3_49 in T.grid(14, 14, 512): diff --git a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py index 49121614ffa0..344f37a23677 100644 --- a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py +++ b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py @@ -52,7 +52,7 @@ def buffer_opaque_access(b: T.handle, c: T.handle) -> None: with T.block(): T.reads([]) T.writes(B[0:16, 0:16]) - A = T.allocate([256], "float32", "global") + A = T.decl_buffer([256], "float32") for i, j in T.grid(16, 16): A[i * 16 + j] = 1 for i in range(0, 16): diff --git a/tests/python/unittest/test_tir_ptx_mma.py b/tests/python/unittest/test_tir_ptx_mma.py index 23405fdee98a..bee9b7b48020 100644 --- a/tests/python/unittest/test_tir_ptx_mma.py +++ b/tests/python/unittest/test_tir_ptx_mma.py @@ -36,9 +36,9 @@ def 
gemm_mma_m8n8k4_row_col_fp64pf64fp64(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([1], "float64", scope="local") - MultiB = T.allocate([1], "float64", scope="local") - Accum = T.allocate([2], "float64", scope="local") + MultiA = T.decl_buffer([1], "float64", scope="local") + MultiB = T.decl_buffer([1], "float64", scope="local") + Accum = T.decl_buffer([2], "float64", scope="local") for i in range(2): Accum[i] = T.float64(0) @@ -106,9 +106,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([4], "float16", scope="local") - MultiB = T.allocate([4], "float16", scope="local") - Accum = T.allocate([8], "float16", scope="local") + MultiA = T.decl_buffer([4], "float16", scope="local") + MultiB = T.decl_buffer([4], "float16", scope="local") + Accum = T.decl_buffer([8], "float16", scope="local") for i in range(8): Accum[i] = T.float32(0) @@ -187,9 +187,10 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([4], "float16", scope="local") - MultiB = T.allocate([4], "float16", scope="local") - Accum = T.allocate([8], "float32", scope="local") + MultiA = T.decl_buffer([4], "float16", scope="local") + MultiB = T.decl_buffer([4], "float16", scope="local") + Accum = T.decl_buffer([8], "float32", scope="local") + for i in range(8): Accum[i] = T.float32(0) @@ -274,9 +275,9 @@ def gemm_mma_m8n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([4], "int8", scope="local") - MultiB = T.allocate([4], "int8", scope="local") - Accum = T.allocate([2], "int32", scope="local") + MultiA = T.decl_buffer([4], "int8", scope="local") + MultiB = 
T.decl_buffer([4], "int8", scope="local") + Accum = T.decl_buffer([2], "int32", scope="local") for i in range(2): Accum[i] = T.int32(0) @@ -350,9 +351,9 @@ def gemm_mma_m8n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([4], "int8", scope="local") - MultiB = T.allocate([4], "uint8", scope="local") - Accum = T.allocate([2], "int32", scope="local") + MultiA = T.decl_buffer([4], "int8", scope="local") + MultiB = T.decl_buffer([4], "uint8", scope="local") + Accum = T.decl_buffer([2], "int32", scope="local") for i in range(2): Accum[i] = T.int32(0) @@ -426,9 +427,9 @@ def gemm_mma_m8n8k32_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "int4", scope="local") - MultiB = T.allocate([8], "int4", scope="local") - Accum = T.allocate([2], "int32", scope="local") + MultiA = T.decl_buffer([8], "int4", scope="local") + MultiB = T.decl_buffer([8], "int4", scope="local") + Accum = T.decl_buffer([2], "int32", scope="local") for i in range(2): Accum[i] = T.int32(0) @@ -494,9 +495,9 @@ def gemm_mma_m8n8k32_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "int4", scope="local") - MultiB = T.allocate([8], "uint4", scope="local") - Accum = T.allocate([2], "int32", scope="local") + MultiA = T.decl_buffer([8], "int4", scope="local") + MultiB = T.decl_buffer([8], "uint4", scope="local") + Accum = T.decl_buffer([2], "int32", scope="local") for i in range(2): Accum[i] = T.int32(0) @@ -562,9 +563,9 @@ def gemm_mma_m16n8k8_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle) T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([4], "float16", scope="local") - MultiB = T.allocate([2], "float16", scope="local") - Accum = 
T.allocate([4], "float32", scope="local") + MultiA = T.decl_buffer([4], "float16", scope="local") + MultiB = T.decl_buffer([2], "float16", scope="local") + Accum = T.decl_buffer([4], "float32", scope="local") for i in range(4): Accum[i] = T.float32(0) @@ -640,9 +641,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "float16", scope="local") - MultiB = T.allocate([4], "float16", scope="local") - Accum = T.allocate([4], "float16", scope="local") + MultiA = T.decl_buffer([8], "float16", scope="local") + MultiB = T.decl_buffer([4], "float16", scope="local") + Accum = T.decl_buffer([4], "float16", scope="local") for i in range(4): Accum[i] = T.float32(0) @@ -722,9 +723,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "float16", scope="local") - MultiB = T.allocate([4], "float16", scope="local") - Accum = T.allocate([4], "float32", scope="local") + MultiA = T.decl_buffer([8], "float16", scope="local") + MultiB = T.decl_buffer([4], "float16", scope="local") + Accum = T.decl_buffer([4], "float32", scope="local") for i in range(4): Accum[i] = T.float32(0) @@ -804,9 +805,9 @@ def gemm_mma_m16n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "int8", scope="local") - MultiB = T.allocate([4], "int8", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([8], "int8", scope="local") + MultiB = T.decl_buffer([4], "int8", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -886,9 +887,9 @@ def gemm_mma_m16n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) 
T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([8], "int8", scope="local") - MultiB = T.allocate([4], "uint8", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([8], "int8", scope="local") + MultiB = T.decl_buffer([4], "uint8", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -968,9 +969,9 @@ def gemm_mma_m16n8k32_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([16], "int8", scope="local") - MultiB = T.allocate([8], "int8", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([16], "int8", scope="local") + MultiB = T.decl_buffer([8], "int8", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -1050,9 +1051,9 @@ def gemm_mma_m16n8k32_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([16], "int8", scope="local") - MultiB = T.allocate([8], "uint8", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([16], "int8", scope="local") + MultiB = T.decl_buffer([8], "uint8", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -1132,9 +1133,9 @@ def gemm_mma_m16n8k64_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([32], "int4", scope="local") - MultiB = T.allocate([16], "int4", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([32], "int4", scope="local") + MultiB = T.decl_buffer([16], "int4", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -1206,9 +1207,9 
@@ def gemm_mma_m16n8k64_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([32], "int4", scope="local") - MultiB = T.allocate([16], "uint4", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([32], "int4", scope="local") + MultiB = T.decl_buffer([16], "uint4", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) @@ -1280,9 +1281,9 @@ def gemm_mma_m16n8k256_row_col_b1b1s32(a: T.handle, b: T.handle, c: T.handle): T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - MultiA = T.allocate([128], "int1", scope="local") - MultiB = T.allocate([64], "int1", scope="local") - Accum = T.allocate([4], "int32", scope="local") + MultiA = T.decl_buffer([128], "int1", scope="local") + MultiB = T.decl_buffer([64], "int1", scope="local") + Accum = T.decl_buffer([4], "int32", scope="local") for i in range(4): Accum[i] = T.int32(0) diff --git a/tests/python/unittest/test_tir_ptx_mma_sp.py b/tests/python/unittest/test_tir_ptx_mma_sp.py index 321cd28ff6f7..24170b4898f9 100644 --- a/tests/python/unittest/test_tir_ptx_mma_sp.py +++ b/tests/python/unittest/test_tir_ptx_mma_sp.py @@ -52,10 +52,10 @@ def mma_sp_m16n8k16_f16f16f16(a: T.handle, b: T.handle, c: T.handle, _metadata: T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - multi_a = T.allocate([4], "float16", scope="local") - multi_b = T.allocate([4], "float16", scope="local") - accum = T.allocate([4], "float16", scope="local") - meta_local = T.allocate([1], "uint32", scope="local") + multi_a = T.decl_buffer([4], "float16", scope="local") + multi_b = T.decl_buffer([4], "float16", scope="local") + accum = T.decl_buffer([4], "float16", scope="local") + meta_local = T.decl_buffer([1], "uint32", scope="local") for i in range(4): accum[i] = T.float16(0) @@ -106,10 +106,10 @@ def 
mma_sp_m16n8k16_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata: T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - multi_a = T.allocate([4], "float16", scope="local") - multi_b = T.allocate([4], "float16", scope="local") - accum = T.allocate([4], "float32", scope="local") - meta_local = T.allocate([1], "uint32", scope="local") + multi_a = T.decl_buffer([4], "float16", scope="local") + multi_b = T.decl_buffer([4], "float16", scope="local") + accum = T.decl_buffer([4], "float32", scope="local") + meta_local = T.decl_buffer([1], "uint32", scope="local") for i in range(4): accum[i] = T.float16(0) @@ -160,10 +160,10 @@ def mma_sp_m16n8k32_f16f16f16(a: T.handle, b: T.handle, c: T.handle, _metadata: T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - multi_a = T.allocate([8], "float16", scope="local") - multi_b = T.allocate([8], "float16", scope="local") - accum = T.allocate([4], "float16", scope="local") - meta_local = T.allocate([1], "uint32", scope="local") + multi_a = T.decl_buffer([8], "float16", scope="local") + multi_b = T.decl_buffer([8], "float16", scope="local") + accum = T.decl_buffer([4], "float16", scope="local") + meta_local = T.decl_buffer([1], "uint32", scope="local") for i in range(4): accum[i] = T.float16(0) @@ -214,10 +214,10 @@ def mma_sp_m16n8k32_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata: T.launch_thread(brow, 1) T.launch_thread(bcol, 1) T.launch_thread(tx, 32) - multi_a = T.allocate([8], "float16", scope="local") - multi_b = T.allocate([8], "float16", scope="local") - accum = T.allocate([4], "float32", scope="local") - meta_local = T.allocate([1], "uint32", scope="local") + multi_a = T.decl_buffer([8], "float16", scope="local") + multi_b = T.decl_buffer([8], "float16", scope="local") + accum = T.decl_buffer([4], "float32", scope="local") + meta_local = T.decl_buffer([1], "uint32", scope="local") for i in range(4): accum[i] = T.float16(0) diff --git 
a/tests/python/unittest/test_tir_renew_defs.py b/tests/python/unittest/test_tir_renew_defs.py index 36cc52c16935..28b440a608dc 100644 --- a/tests/python/unittest/test_tir_renew_defs.py +++ b/tests/python/unittest/test_tir_renew_defs.py @@ -135,7 +135,8 @@ def test_undefined_buffer(): @T.prim_func def access_alloc(): # Buffer A should be remapped - A = T.allocate([128], "float16", "global") + A_data = T.allocate([128], "float16", "global") + A = T.buffer_decl(shape=[128], dtype="float16", data=A_data) # check if buffer var also get remapped T.evaluate(A.data) for i in range(128): diff --git a/tests/python/unittest/test_tir_structural_equal_hash.py b/tests/python/unittest/test_tir_structural_equal_hash.py index d5feb21f0db7..4bb13ed77ad8 100644 --- a/tests/python/unittest/test_tir_structural_equal_hash.py +++ b/tests/python/unittest/test_tir_structural_equal_hash.py @@ -234,7 +234,7 @@ def test_buffer_storage_scope(): buffer_local_0 = tvm.tir.decl_buffer((10, 10), "float32", scope="local") buffer_local_1 = tvm.tir.decl_buffer((10, 10), "float32", scope="local") - buffer_global = tvm.tir.decl_buffer((10, 10), "float32", scope="global") + buffer_global = tvm.tir.decl_buffer((10, 10), "float32") buffer_empty = tvm.tir.decl_buffer((10, 10), "float32", scope="") func0 = tvm.tir.PrimFunc([x], tvm.tir.Evaluate(x), buffer_map={x: buffer_local_0}) diff --git a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py index 1a3afdd4c1e2..e08f04fa1f25 100644 --- a/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py +++ b/tests/python/unittest/test_tir_transform_convert_for_loops_serial.py @@ -31,13 +31,13 @@ def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T. 
placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_3 = T.allocate([150528], "int16", "global") + PaddedInput_3 = T.decl_buffer([150528], "int16") for i0_i1_fused_3 in T.parallel(0, 28): for i2_3, i3_3 in T.grid(28, 192): PaddedInput_3[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3) ] = placeholder_33[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)] for ax0_ax1_fused_ax2_fused_3 in T.parallel(0, 784): for ax3_2 in T.serial(0, 16): - Conv2dOutput_3 = T.allocate([1], "int32", "global") + Conv2dOutput_3 = T.decl_buffer([1], "int32") Conv2dOutput_3[0] = 0 for rc_3 in T.serial(0, 192): Conv2dOutput_3[0] = (Conv2dOutput_3[0] + (T.cast(PaddedInput_3[((ax0_ax1_fused_ax2_fused_3*192) + rc_3)], "int32")*T.cast(placeholder_34[((rc_3*16) + ax3_2)], "int32"))) diff --git a/tests/python/unittest/test_tir_transform_extract_constants.py b/tests/python/unittest/test_tir_transform_extract_constants.py index 82f4f6515c09..5de06e38a557 100644 --- a/tests/python/unittest/test_tir_transform_extract_constants.py +++ b/tests/python/unittest/test_tir_transform_extract_constants.py @@ -27,7 +27,8 @@ class Module4: def constant1(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") - K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K = T.buffer_decl(shape=(10), dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -35,7 +36,8 @@ def constant1(a: T.handle) -> None: def constant2(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") - K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K = T.buffer_decl(shape=(10), dtype="int32", 
data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -43,7 +45,8 @@ def constant2(a: T.handle) -> None: def constant3(a: T.handle) -> None: A = T.match_buffer(a, (10), "int32") B = T.alloc_buffer((10), "int32") - K = T.allocate_const([1, 2, 3, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K_data = T.allocate_const([1, 2, 3, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K = T.buffer_decl(shape=(10), dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index a1195a9d2a65..4cdf71889eee 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -33,7 +33,8 @@ def elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in T.serial(0, 16): - B_new = T.allocate([1, 16], "float32", "global") + B_new_data = T.allocate([1, 16], "float32", "global") + B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data) for j in T.serial(0, 16): B_new[0, j] = A[i, j] + 1.0 for j in T.serial(0, 16): @@ -47,7 +48,8 @@ def flattened_elementwise_func(a: T.handle, c: T.handle) -> None: T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) for i in T.serial(0, 16): - B_new = T.allocate([16], "float32", "global") + B_new_data = T.allocate([16], "float32", "global") + B_new = T.buffer_decl(shape=[16], dtype="float32", data=B_new_data) for j in T.serial(0, 16): B_new[j] = A[((i * 16) + j)] + 1.0 for j in T.serial(0, 16): @@ -66,7 +68,8 @@ def gpu_func(a: T.handle, c: T.handle) -> None: T.launch_thread(i0, 4) T.launch_thread(i1, 2) T.launch_thread(i2, 2) - B = T.allocate([1, 16], "float32", "local") + B_data = T.allocate([1, 16], "float32", "local") + B = T.buffer_decl(shape=[1, 16], dtype="float32", 
data=B_data, scope="local") for j in range(0, 16): B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0 for j in range(0, 16): @@ -87,7 +90,8 @@ def flattened_gpu_func(a: T.handle, c: T.handle) -> None: T.launch_thread(i0, 4) T.launch_thread(i1, 2) T.launch_thread(i2, 2) - B = T.allocate([16], "float32", "local") + B_data = T.allocate([16], "float32", "local") + B = T.buffer_decl(shape=[16], dtype="float32", data=B_data, scope="local") for j in range(0, 16): B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0 for j in range(0, 16): @@ -100,7 +104,8 @@ def symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: C = T.match_buffer(c, (n, m), "float32") for i in range(0, n): - B = T.allocate([m], "float32", "global") + B_data = T.allocate([m], "float32", "global") + B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) for j in range(0, m): B[j] = A[i, j] + 1.0 for j in range(0, m): @@ -115,7 +120,8 @@ def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> T.preflattened_buffer(C, (n, m), "float32", data=C.data) for i in range(0, n): - B = T.allocate([m], "float32", "global") + B_data = T.allocate([m], "float32", "global") + B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) for j in range(0, m): B[j] = A[i * m + j] + 1.0 for j in range(0, m): @@ -128,8 +134,10 @@ def multi_alloc_func(a: T.handle, d: T.handle) -> None: D = T.match_buffer(d, (4, 32), "float32") for i, j in T.grid(4, 32): - B = T.allocate((4, 32), "float32", scope="global") - C = T.allocate((4, 32), "float32", scope="global") + B_data = T.allocate((4, 32), "float32", scope="global") + B = T.buffer_decl(shape=(4, 32), dtype="float32", data=B_data) + C_data = T.allocate((4, 32), "float32", scope="global") + C = T.buffer_decl(shape=(4, 32), dtype="float32", data=C_data) B[i, j] = A[i, j] + 1.0 C[i, j] = A[i, j] + B[i, j] D[i, j] = C[i, j] * 2.0 @@ -143,8 +151,10 @@ def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None: T.preflattened_buffer(D, (4, 32), 
"float32", data=D.data) for i, j in T.grid(4, 32): - B = T.allocate([128], "float32", "global") - C = T.allocate([128], "float32", "global") + B_data = T.allocate([128], "float32", "global") + B = T.buffer_decl(shape=[128], dtype="float32", data=B_data) + C_data = T.allocate([128], "float32", "global") + C = T.buffer_decl(shape=[128], dtype="float32", data=C_data) B[i * 32 + j] = A[i * 32 + j] + 1.0 C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j] D[i * 32 + j] = C[i * 32 + j] * 2.0 @@ -155,7 +165,8 @@ def strided_buffer_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i0 in T.serial(4): - B = T.allocate([4, 17], "float32", "global") + B_data = T.allocate([4, 17], "float32", "global") + B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data) B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) for i1, j in T.grid(4, 16): B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0 @@ -170,7 +181,8 @@ def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None: T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data) T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data) for i0 in T.serial(0, 4): - B_new = T.allocate([68], "float32", "global") + B_new_data = T.allocate([68], "float32", "global") + B_new = T.buffer_decl(shape=[68], dtype="float32", data=B_new_data) for i1 in T.serial(0, 4): for j in T.serial(0, 16): B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0 diff --git a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py index b96afb6a0941..548f3bc8d1d2 100644 --- a/tests/python/unittest/test_tir_transform_inject_virtual_thread.py +++ b/tests/python/unittest/test_tir_transform_inject_virtual_thread.py @@ -145,12 +145,14 @@ def test_vthread_simplified(): def before_func(): vthread = T.env_thread("vthread") T.launch_thread(vthread, 4) - B = T.allocate([4], "int32", 
"shared") + B_data = T.allocate([4], "int32", scope="shared") + B = T.buffer_decl([4], "int32", data=B_data, scope="shared") B[0:4] = T.broadcast(vthread, 4) @T.prim_func def expected_func(): - B = T.allocate([16], "int32", "shared") + B_data = T.allocate([16], "int32", scope="shared") + B = T.buffer_decl([16], "int32", data=B_data, scope="shared") # The indices for B should each be a single Ramp node, and # should not be the sum of a Ramp and Broadcast node. B[0 * 4 : 0 * 4 + 4] = T.broadcast(0, 4) @@ -172,12 +174,14 @@ def test_vthread_vectorized(): def before_func(): vthread = T.env_thread("vthread") T.launch_thread(vthread, 4) - B = T.allocate([4], "int32", "shared") + B_data = T.allocate([4], "int32", "shared") + B = T.buffer_decl([4], "int32", data=B_data, scope="shared") B[0:4] = T.broadcast(vthread, 4) @T.prim_func def expected_func(): - B = T.allocate([4], "int32x4", "shared") + B_data = T.allocate([4], "int32x4", "shared") + B = T.buffer_decl([4], "int32x4", data=B_data, scope="shared") B[0 * 4 / 4] = T.broadcast(0, 4) B[1 * 4 / 4] = T.broadcast(1, 4) B[2 * 4 / 4] = T.broadcast(2, 4) diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py index 6f557ba09d43..f8f3e3a5aced 100644 --- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py +++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py @@ -54,7 +54,8 @@ def transformed_elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in T.serial(0, 16): - B_new = T.allocate([1, 16], "float32", "global") + B_new_data = T.allocate([1, 16], "float32", "global") + B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data) for j in T.serial(0, 16): B_new[0, j] = A[i, j] + 1.0 for j in T.serial(0, 16): @@ -96,7 +97,8 @@ def transformed_gpu_func(a: T.handle, c: T.handle) -> None: T.launch_thread(i0, 4) 
T.launch_thread(i1, 2) T.launch_thread(i2, 2) - B = T.allocate([1, 16], "float32", "local") + B_data = T.allocate([1, 16], "float32", "local") + B = T.buffer_decl(shape=[1, 16], dtype="float32", scope="local", data=B_data) for j in range(0, 16): B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0 for j in range(0, 16): @@ -131,7 +133,8 @@ def transformed_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) C = T.match_buffer(c, (n, m), "float32") for i in range(0, n): - B = T.allocate([m], "float32", "global") + B_data = T.allocate([m], "float32", "global") + B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) for j in range(0, m): B[j] = A[i, j] + 1.0 for j in range(0, m): @@ -204,8 +207,10 @@ def transformed_multi_alloc_func(a: T.handle, d: T.handle) -> None: D = T.match_buffer(d, (32), "float32") for i in range(0, 32): - B = T.allocate((32,), "float32", "global") - C = T.allocate((32,), "float32", "global") + B_data = T.allocate((32,), "float32", "global") + B = T.buffer_decl(shape=(32,), dtype="float32", data=B_data) + C_data = T.allocate((32,), "float32", "global") + C = T.buffer_decl(shape=(32,), dtype="float32", data=C_data) B[i] = A[i] + 1.0 C[i] = A[i] + B[i] D[i] = C[i] * 2.0 @@ -240,7 +245,8 @@ def transformed_strided_buffer_func( ) -> None: # body for i0 in T.serial(4): - B = T.allocate([4, 17], "float32", "global") + B_data = T.allocate([4, 17], "float32", "global") + B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data) B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) for i1, j in T.grid(4, 16): B_1[i1, j] = A[i0 * 4 + i1, j] + T.float32(1) diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py index fd08f7e2249a..bfa132d4cecf 100644 --- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py +++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py @@ -36,9 +36,9 @@ def 
main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo blockIdx_x = T.env_thread("blockIdx.x") # body T.launch_thread(blockIdx_x, 64) - conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local") - PadInput_shared = T.allocate([768], "float32", "shared") - weight_shared = T.allocate([4096], "float32", "shared") + conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local") + PadInput_shared = T.decl_buffer([768], "float32", scope="shared") + weight_shared = T.decl_buffer([4096], "float32", scope="shared") T.launch_thread(threadIdx_x, 32) for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2): conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0) @@ -67,9 +67,9 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo blockIdx_x = T.env_thread("blockIdx.x") # body T.launch_thread(blockIdx_x, 64) - conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local") - PadInput_shared = T.allocate([768], "float32", "shared") - weight_shared = T.allocate([4096], "float32", "shared") + conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local") + PadInput_shared = T.decl_buffer([768], "float32", scope="shared") + weight_shared = T.decl_buffer([4096], "float32", scope="shared") T.launch_thread(threadIdx_x, 32) for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2): conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0) @@ -98,9 +98,9 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data) # body T.launch_thread(blockIdx_x, 64) - conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local") - PadInput_shared = T.allocate([768], "float32", "shared") - weight_shared = T.allocate([4096], "float32", "shared") + conv2d_transpose_nhwc_local = T.decl_buffer([8], "float32", scope="local") + 
PadInput_shared = T.decl_buffer([768], "float32", scope="shared") + weight_shared = T.decl_buffer([4096], "float32", scope="shared") T.launch_thread(threadIdx_x, 32) for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2): conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0) diff --git a/tests/python/unittest/test_tir_transform_storage_flatten.py b/tests/python/unittest/test_tir_transform_storage_flatten.py index ff59f10c0168..95e2eaed55fa 100644 --- a/tests/python/unittest/test_tir_transform_storage_flatten.py +++ b/tests/python/unittest/test_tir_transform_storage_flatten.py @@ -95,7 +95,7 @@ def main(A_param: T.handle, C_param: T.handle): threadIdx_x = T.env_thread("threadIdx.x") T.launch_thread(threadIdx_x, 1) for i in T.serial(0, 100): - B = T.allocate([4], "float32", scope="shared") + B = T.decl_buffer([4], "float32", scope="shared") with T.attr(B.data, "double_buffer_scope", 1): for j in T.serial(0, 4): B[j] = A[4 * i + j] @@ -142,7 +142,7 @@ def main(): A_data: T.Ptr[T.int32] = T.call_extern("dummy_extern_function", dtype="handle") # and a buffer is backed by that pointer, - A = T.buffer_decl([1], dtype="float32", data=A_data) + A = T.decl_buffer([1], dtype="float32", data=A_data) T.evaluate(A[0]) # then the call to StorageFlatten would result in an exception diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py index b7cb75594997..581afef88942 100644 --- a/tests/python/unittest/test_tir_transform_storage_rewrite.py +++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py @@ -654,14 +654,16 @@ def test_access_in_let_value(): @T.prim_func def func(A: T.Buffer[(8,), "float32"]): for i in range(8): - B = T.allocate((1,), "float32", "global") + B_data = T.allocate((1,), "float32", "global") + B = T.buffer_decl(shape=[1], dtype="float32", data=B_data) B[0] = 3.14 x: T.float32 = T.exp(B[0], dtype="float32") A[i] = (x + 1.0) / (x - 1.0) 
@T.prim_func def func_rewritten(A: T.Buffer[(8,), "float32"]) -> None: - B = T.allocate((1,), "float32", "global") + B_data = T.allocate((1,), "float32", "global") + B = T.buffer_decl(shape=[1], dtype="float32", data=B_data) for i in range(8): B[0] = 3.14 x: T.float32 = T.exp(B[0], dtype="float32") diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py index 6dba694e45ac..3a638ba45122 100644 --- a/tests/python/unittest/test_tir_transform_unroll_loop.py +++ b/tests/python/unittest/test_tir_transform_unroll_loop.py @@ -117,16 +117,19 @@ class before: @T.prim_func def main(): for i in T.unroll(2): - with T.allocate([16], "float32", "global") as buf: + with T.allocate([16], "float32", "global") as buf_data: + buf = T.buffer_decl(shape=[16], dtype="float32", data=buf_data) buf[0] = 0.0 @tvm.script.ir_module class expected: @T.prim_func def main(): - with T.allocate([16], "float32", "global") as buf1: + with T.allocate([16], "float32", "global") as buf1_data: + buf1 = T.buffer_decl(shape=[16], dtype="float32", data=buf1_data) buf1[0] = 0.0 - with T.allocate([16], "float32", "global") as buf2: + with T.allocate([16], "float32", "global") as buf2_data: + buf2 = T.buffer_decl(shape=[16], dtype="float32", data=buf2_data) buf2[0] = 0.0 after = tvm.tir.transform.UnrollLoop()(before) diff --git a/tests/python/unittest/test_tir_usmp_algo.py b/tests/python/unittest/test_tir_usmp_algo.py index f67148189d8c..265e6fe5d5d5 100644 --- a/tests/python/unittest/test_tir_usmp_algo.py +++ b/tests/python/unittest/test_tir_usmp_algo.py @@ -316,12 +316,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_7 = T.allocate([157323], "int16", "global") + 
PaddedInput_7 = T.decl_buffer([157323], "int16") for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - Conv2dOutput_7 = T.allocate([64], "int32", "global") + Conv2dOutput_7 = T.decl_buffer([64], "int32") for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ -336,7 +336,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = T.allocate([200704], "uint8", "global") + tensor_2 = T.decl_buffer([200704], "uint8") for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -356,9 +356,9 @@ def run_model(input: T.handle, output: T.handle) -> None: T.attr("default", "device_type", 1) sid_9 = T.allocate([301056], "int8", "global") sid_8 = T.allocate([802816], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32")) + 
T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32")) __tvm_meta__ = None # fmt: on @@ -436,11 +436,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32") T_cast_5 = T.match_buffer(T_cast_4, [360000], dtype="int16") # body - PaddedInput_1 = T.allocate([379456], "int16", "global") + PaddedInput_1 = T.decl_buffer([379456], "int16") for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64): PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625): - Conv2dOutput_1 = T.allocate([64], "int32", "global") + Conv2dOutput_1 = T.decl_buffer([64], "int32") for ff_1 in T.serial(0, 64): Conv2dOutput_1[ff_1] = 0 for ry, rx, rc_1 in T.grid(3, 3, 64): @@ -457,11 +457,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s placeholder_21 = T.match_buffer(placeholder_18, [256], dtype="int32") T_add_1 = T.match_buffer(T_add, [1440000], dtype="int32") # body - PaddedInput_2 = T.allocate([360000], "int16", "global") + PaddedInput_2 = T.decl_buffer([360000], "int16") for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64): PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625): - Conv2dOutput_2 = T.allocate([64], "int32", "global") + Conv2dOutput_2 = T.decl_buffer([64], "int32") for ax3_outer_1 in T.serial(0, 4): for ff_2 in T.serial(0, 64): Conv2dOutput_2[ff_2] = 0 @@ -480,11 +480,11 @@ def 
tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s placeholder_28 = T.match_buffer(placeholder_25, [1440000], dtype="int32") T_cast_7 = T.match_buffer(T_cast_6, [1440000], dtype="uint8") # body - PaddedInput_3 = T.allocate([360000], "int16", "global") + PaddedInput_3 = T.decl_buffer([360000], "int16") for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64): PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625): - Conv2dOutput_3 = T.allocate([64], "int32", "global") + Conv2dOutput_3 = T.decl_buffer([64], "int32") for ax3_outer_2 in T.serial(0, 4): for ff_3 in T.serial(0, 64): Conv2dOutput_3[ff_3] = 0 @@ -504,11 +504,11 @@ def tvmgen_default_run_model(input: T.handle, output: T.handle) -> None: sid_6 = T.allocate([5760000], "int8", "global") sid_7 = T.allocate([720000], "int8", "global") sid_8 = T.allocate([720000], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6.data, dtype="int32")) - 
T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6, output, dtype="int32")) @T.prim_func def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle, T_cast_2: T.handle) -> None: @@ -519,11 +519,11 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32") T_cast_3 = T.match_buffer(T_cast_2, [360000], dtype="int16") # body - PaddedInput = T.allocate([360000], "int16", "global") + PaddedInput = T.decl_buffer([360000], "int16") for i0_i1_fused, i2, i3 in T.grid(75, 75, 64): PaddedInput[i0_i1_fused * 4800 + i2 * 64 + 
i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3] for ax0_ax1_fused_ax2_fused in T.serial(0, 5625): - Conv2dOutput = T.allocate([64], "int32", "global") + Conv2dOutput = T.decl_buffer([64], "int32") for ff in T.serial(0, 64): Conv2dOutput[ff] = 0 for rc in T.serial(0, 64): diff --git a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py index 60360ecade70..52880e40cbee 100644 --- a/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py +++ b/tests/python/unittest/test_tir_usmp_analysis_extract_bufferinfo.py @@ -128,12 +128,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_7 = T.allocate([157323], "int16", "global") + PaddedInput_7 = T.decl_buffer([157323], "int16") for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - Conv2dOutput_7 = T.allocate([64], "int32", "global") + Conv2dOutput_7 = T.decl_buffer([64], "int32") for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ -148,7 +148,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = T.allocate([200704], "uint8", 
"global") + tensor_2 = T.decl_buffer([200704], "uint8") for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -168,9 +168,9 @@ def run_model(input: T.handle, output: T.handle) -> None: T.attr("default", "device_type", 1) sid_9 = T.allocate([301056], "int8", "global") sid_8 = T.allocate([802816], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32")) __tvm_meta__ = None # fmt: on @@ -220,14 +220,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_8 = T.allocate([215296], "int16", "global") + PaddedInput_8 = T.decl_buffer([215296], "int16") for i0_i1_fused_8 in T.serial(0, 58): for i2_8, i3_8 in T.grid(58, 64): PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + 
(i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_8 in T.parallel(0, 3136): - dummy_allocate = T.allocate([1], "int32", "global") + dummy_allocate = T.decl_buffer([1], "int32") for ax3_outer_4 in T.serial(0, 3): - Conv2dOutput_8 = T.allocate([64], "int32", "global") + Conv2dOutput_8 = T.decl_buffer([64], "int32") for ff_4 in T.serial(0, 64): Conv2dOutput_8[ff_4] = 0 for ry_3, rx_3, rc_8 in T.grid(3, 3, 64): @@ -261,14 +261,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_8 = T.allocate([215296], "int16", "global") + PaddedInput_8 = T.decl_buffer([215296], "int16") for i0_i1_fused_8 in T.serial(0, 58): for i2_8, i3_8 in T.grid(58, 64): PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + (i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_8 in T.serial(0, 3136): - dummy_allocate = T.allocate([1], "int32", "global") + dummy_allocate = T.decl_buffer([1], "int32") for ax3_outer_4 in T.serial(0, 3): - Conv2dOutput_8 = T.allocate([64], "int32", "global") + Conv2dOutput_8 = T.decl_buffer([64], "int32") for ff_4 in T.serial(0, 64): Conv2dOutput_8[ff_4] = 0 for ry_3, rx_3, rc_8 in T.grid(3, 3, 64): @@ -394,12 +394,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place placeholder_21 = T.match_buffer(placeholder_18, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_3 = T.match_buffer(T_cast_2, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput = T.allocate([200704], "int16", "global") + PaddedInput = 
T.decl_buffer([200704], "int16") for i0_i1_fused in T.serial(0, 56): for i2, i3 in T.grid(56, 64): PaddedInput[(((i0_i1_fused*3584) + (i2*64)) + i3)] = placeholder_19[(((i0_i1_fused*3584) + (i2*64)) + i3)] for ax0_ax1_fused_ax2_fused in T.serial(0, 3136): - Conv2dOutput = T.allocate([64], "int32", "global") + Conv2dOutput = T.decl_buffer([64], "int32") for ff in T.serial(0, 64): Conv2dOutput[ff] = 0 for rc in T.serial(0, 64): @@ -416,12 +416,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla placeholder_27 = T.match_buffer(placeholder_24, [96], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_5 = T.match_buffer(T_cast_4, [153], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_1 = T.allocate([150528], "int16", "global") + PaddedInput_1 = T.decl_buffer([150528], "int16") for i0_i1_fused_1 in T.serial(0, 28): for i2_1, i3_1 in T.grid(28, 192): PaddedInput_1[(((i0_i1_fused_1*5376) + (i2_1*192)) + i3_1)] = placeholder_25[(((i0_i1_fused_1*5376) + (i2_1*192)) + i3_1)] for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 784): - Conv2dOutput_1 = T.allocate([1], "int32", "global") + Conv2dOutput_1 = T.decl_buffer([1], "int32") for ax3_1 in T.serial(0, 96): Conv2dOutput_1[0] = 0 for rc_1 in T.serial(0, 192): @@ -435,7 +435,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = T.allocate([200704], "uint8", "global") + tensor_2 = T.decl_buffer([200704], "uint8") for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -455,12 +455,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2(placehol placeholder_35 = T.match_buffer(placeholder_32, [64], dtype="int32", 
elem_offset=0, align=64, offset_factor=1) T_cast_9 = T.match_buffer(T_cast_8, [121], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_2 = T.allocate([150528], "int16", "global") + PaddedInput_2 = T.decl_buffer([150528], "int16") for i0_i1_fused_2 in T.serial(0, 28): for i2_2, i3_2 in T.grid(28, 192): PaddedInput_2[(((i0_i1_fused_2*5376) + (i2_2*192)) + i3_2)] = placeholder_33[(((i0_i1_fused_2*5376) + (i2_2*192)) + i3_2)] for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 784): - Conv2dOutput_2 = T.allocate([64], "int32", "global") + Conv2dOutput_2 = T.decl_buffer([64], "int32") for ff_1 in T.serial(0, 64): Conv2dOutput_2[ff_1] = 0 for rc_2 in T.serial(0, 192): @@ -475,7 +475,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast_1(placeholder_36: T.handle, T_cast_1 placeholder_37 = T.match_buffer(placeholder_36, [150528], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_11 = T.match_buffer(T_cast_10, [249], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_3 = T.allocate([150528], "uint8", "global") + tensor_3 = T.decl_buffer([150528], "uint8") for ax0_ax1_fused_6 in T.serial(0, 28): for ax2_6 in T.serial(0, 28): for ax3_outer_init_1, ax3_inner_init_1 in T.grid(3, 64): @@ -495,12 +495,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed placeholder_43 = T.match_buffer(placeholder_40, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_13 = T.match_buffer(T_cast_12, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_3 = T.allocate([150528], "int16", "global") + PaddedInput_3 = T.decl_buffer([150528], "int16") for i0_i1_fused_3 in T.serial(0, 28): for i2_3, i3_3 in T.grid(28, 192): PaddedInput_3[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)] = placeholder_41[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)] for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 784): - Conv2dOutput_3 = T.allocate([1], "int32", "global") + Conv2dOutput_3 = 
T.decl_buffer([1], "int32") for ax3_5 in T.serial(0, 32): Conv2dOutput_3[0] = 0 for rc_3 in T.serial(0, 192): @@ -516,12 +516,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(pla placeholder_49 = T.match_buffer(placeholder_46, [16], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_15 = T.match_buffer(T_cast_14, [73], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_4 = T.allocate([150528], "int16", "global") + PaddedInput_4 = T.decl_buffer([150528], "int16") for i0_i1_fused_4 in T.serial(0, 28): for i2_4, i3_4 in T.grid(28, 192): PaddedInput_4[(((i0_i1_fused_4*5376) + (i2_4*192)) + i3_4)] = placeholder_47[(((i0_i1_fused_4*5376) + (i2_4*192)) + i3_4)] for ax0_ax1_fused_ax2_fused_4 in T.serial(0, 784): - Conv2dOutput_4 = T.allocate([1], "int32", "global") + Conv2dOutput_4 = T.decl_buffer([1], "int32") for ax3_6 in T.serial(0, 16): Conv2dOutput_4[0] = 0 for rc_4 in T.serial(0, 192): @@ -537,12 +537,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed placeholder_55 = T.match_buffer(placeholder_52, [32], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_17 = T.match_buffer(T_cast_16, [89], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_5 = T.allocate([14400], "int16", "global") + PaddedInput_5 = T.decl_buffer([14400], "int16") for i0_i1_fused_5 in T.serial(0, 30): for i2_5, i3_5 in T.grid(30, 16): PaddedInput_5[(((i0_i1_fused_5*480) + (i2_5*16)) + i3_5)] = T.if_then_else(((((1 <= i0_i1_fused_5) and (i0_i1_fused_5 < 29)) and (1 <= i2_5)) and (i2_5 < 29)), placeholder_53[((((i0_i1_fused_5*448) + (i2_5*16)) + i3_5) - 464)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_5 in T.serial(0, 784): - Conv2dOutput_5 = T.allocate([1], "int32", "global") + Conv2dOutput_5 = T.decl_buffer([1], "int32") for ax3_7 in T.serial(0, 32): Conv2dOutput_5[0] = 0 for ry, rx, rc_5 in T.grid(3, 3, 16): @@ -558,12 +558,12 @@ def 
tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed placeholder_61 = T.match_buffer(placeholder_58, [128], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_19 = T.match_buffer(T_cast_18, [185], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_6 = T.allocate([86400], "int16", "global") + PaddedInput_6 = T.decl_buffer([86400], "int16") for i0_i1_fused_6 in T.serial(0, 30): for i2_6, i3_6 in T.grid(30, 96): PaddedInput_6[(((i0_i1_fused_6*2880) + (i2_6*96)) + i3_6)] = T.if_then_else(((((1 <= i0_i1_fused_6) and (i0_i1_fused_6 < 29)) and (1 <= i2_6)) and (i2_6 < 29)), placeholder_59[((((i0_i1_fused_6*2688) + (i2_6*96)) + i3_6) - 2784)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_6 in T.serial(0, 784): - Conv2dOutput_6 = T.allocate([64], "int32", "global") + Conv2dOutput_6 = T.decl_buffer([64], "int32") for ax3_outer_3 in T.serial(0, 2): for ff_2 in T.serial(0, 64): Conv2dOutput_6[ff_2] = 0 @@ -581,12 +581,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_7 = T.allocate([157323], "int16", "global") + PaddedInput_7 = T.decl_buffer([157323], "int16") for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - Conv2dOutput_7 = T.allocate([64], "int32", "global") + Conv2dOutput_7 = T.decl_buffer([64], "int32") for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ 
-603,12 +603,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1(placehol placeholder_73 = T.match_buffer(placeholder_70, [192], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_23 = T.match_buffer(T_cast_22, [305], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_8 = T.allocate([215296], "int16", "global") + PaddedInput_8 = T.decl_buffer([215296], "int16") for i0_i1_fused_8 in T.serial(0, 58): for i2_8, i3_8 in T.grid(58, 64): PaddedInput_8[(((i0_i1_fused_8*3712) + (i2_8*64)) + i3_8)] = T.if_then_else(((((1 <= i0_i1_fused_8) and (i0_i1_fused_8 < 57)) and (1 <= i2_8)) and (i2_8 < 57)), placeholder_71[((((i0_i1_fused_8*3584) + (i2_8*64)) + i3_8) - 3648)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_8 in T.serial(0, 3136): - Conv2dOutput_8 = T.allocate([64], "int32", "global") + Conv2dOutput_8 = T.decl_buffer([64], "int32") for ax3_outer_4 in T.serial(0, 3): for ff_4 in T.serial(0, 64): Conv2dOutput_8[ff_4] = 0 @@ -638,21 +638,21 @@ def run_model(input: T.handle, output: T.handle) -> None: sid_25 = T.allocate([25088], "int8", "global") sid_26 = T.allocate([25088], "int8", "global") sid_31 = T.allocate([25088], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, sid_7.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_7.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_6.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", 
sid_6.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_5.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d", sid_5.data, sid_4.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_cast", sid_4.data, sid_3.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", sid_3.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_2.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_3.data, T.lookup_param("p9", dtype="handle"), T.lookup_param("p10", dtype="handle"), sid_20.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", sid_20.data, T.lookup_param("p11", dtype="handle"), T.lookup_param("p12", dtype="handle"), sid_19.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", sid_3.data, T.lookup_param("p13", dtype="handle"), T.lookup_param("p14", dtype="handle"), sid_26.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", sid_26.data, T.lookup_param("p15", dtype="handle"), T.lookup_param("p16", dtype="handle"), sid_25.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast_1", sid_4.data, sid_32.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", sid_32.data, T.lookup_param("p17", dtype="handle"), T.lookup_param("p18", dtype="handle"), sid_31.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_concatenate", sid_2.data, sid_19.data, sid_25.data, 
sid_31.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, sid_7, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_7, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_6, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_1", sid_6, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_5, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d", sid_5, sid_4, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast", sid_4, sid_3, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_2", sid_3, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_2, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_3, T.lookup_param("p9", dtype="handle"), T.lookup_param("p10", dtype="handle"), sid_20, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320_", sid_20, T.lookup_param("p11", dtype="handle"), T.lookup_param("p12", dtype="handle"), sid_19, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", sid_3, T.lookup_param("p13", dtype="handle"), T.lookup_param("p14", dtype="handle"), sid_26, dtype="int32")) + 
T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__1", sid_26, T.lookup_param("p15", dtype="handle"), T.lookup_param("p16", dtype="handle"), sid_25, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast_1", sid_4, sid_32, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_fixed_point_multiply_cli_4464294615199028320__2", sid_32, T.lookup_param("p17", dtype="handle"), T.lookup_param("p18", dtype="handle"), sid_31, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_concatenate", sid_2, sid_19, sid_25, sid_31, output, dtype="int32")) __tvm_meta__ = None # fmt: on @@ -1129,11 +1129,11 @@ def tvmgen_default_fused_nn_contrib_conv2d_NCHWc(placeholder_2: T.handle, placeh placeholder_5 = T.match_buffer(placeholder_3, [81], dtype="float32") conv2d_NCHWc_1 = T.match_buffer(conv2d_NCHWc, [41], dtype="float32") # body - data_pad = T.allocate([1092], "float32", "global") + data_pad = T.decl_buffer([1092], "float32") for i0_i1_fused_i2_fused, i3, i4 in T.grid(26, 14, 3): data_pad[i0_i1_fused_i2_fused * 42 + i3 * 3 + i4] = T.if_then_else(1 <= i0_i1_fused_i2_fused and i0_i1_fused_i2_fused < 25 and 1 <= i3 and i3 < 13, placeholder_4[i0_i1_fused_i2_fused * 36 + i3 * 3 + i4 - 39], T.float32(0), dtype="float32") for n_oc_chunk_fused_oh_fused in T.serial(0, 24): - conv2d_NCHWc_global = T.allocate([36], "float32", "global") + conv2d_NCHWc_global = T.decl_buffer([36], "float32") for oc_block_c_init in T.serial(0, 3): conv2d_NCHWc_global[oc_block_c_init] = T.float32(0) for oc_block_c_init in T.serial(0, 3): @@ -1198,15 +1198,15 @@ def tvmgen_default_fused_nn_softmax_add_add_multiply_add(placeholder_6: T.handle T_add_1 = T.match_buffer(T_add, [864], dtype="float32") # body for ax0_ax1_fused_ax2_fused in T.serial(0, 72): - T_softmax_norm = T.allocate([12], "float32", "global") - with 
T.allocate([1], "float32", "global") as T_softmax_maxelem: + T_softmax_norm = T.decl_buffer([12], "float32") + with T.decl_buffer([1], "float32") as T_softmax_maxelem: T_softmax_maxelem[0] = T.float32(-3.4028234663852886e+38) for k in T.serial(0, 12): T_softmax_maxelem[0] = T.max(T_softmax_maxelem[0], placeholder_11[ax0_ax1_fused_ax2_fused * 12 + k]) - T_softmax_exp = T.allocate([12], "float32", "global") + T_softmax_exp = T.decl_buffer([12], "float32") for i3 in T.serial(0, 12): T_softmax_exp[i3] = T.exp(placeholder_11[ax0_ax1_fused_ax2_fused * 12 + i3] - T_softmax_maxelem[0], dtype="float32") - T_softmax_expsum = T.allocate([1], "float32", "global") + T_softmax_expsum = T.decl_buffer([1], "float32") T_softmax_expsum[0] = T.float32(0) for k in T.serial(0, 12): T_softmax_expsum[0] = T_softmax_expsum[0] + T_softmax_exp[k] @@ -1224,8 +1224,8 @@ def tvmgen_default_fused_nn_contrib_dense_pack_nn_relu(placeholder_16: T.handle, T_relu_1 = T.match_buffer(T_relu, [864], dtype="float32") # body for ax1_outer_ax0_outer_fused in T.serial(0, 18): - compute = T.allocate([48], "float32", "global") - with T.allocate([48], "float32", "global") as compute_global: + compute = T.decl_buffer([48], "float32") + with T.decl_buffer([48], "float32") as compute_global: for x_c_init in T.serial(0, 6): compute_global[x_c_init] = T.float32(0) for x_c_init in T.serial(0, 6): @@ -1317,15 +1317,15 @@ def tvmgen_default_fused_nn_softmax_add(placeholder_26: T.handle, placeholder_27 T_add_3 = T.match_buffer(T_add_2, [864], dtype="float32") # body for ax0_ax1_fused_ax2_fused in T.serial(0, 72): - T_softmax_norm = T.allocate([12], "float32", "global") - with T.allocate([1], "float32", "global") as T_softmax_maxelem: + T_softmax_norm = T.decl_buffer([12], "float32") + with T.decl_buffer([1], "float32") as T_softmax_maxelem: T_softmax_maxelem[0] = T.float32(-3.4028234663852886e+38) for k in T.serial(0, 12): T_softmax_maxelem[0] = T.max(T_softmax_maxelem[0], placeholder_28[ax0_ax1_fused_ax2_fused * 12 + 
k]) - T_softmax_exp = T.allocate([12], "float32", "global") + T_softmax_exp= T.decl_buffer([12], "float32") for i3 in T.serial(0, 12): T_softmax_exp[i3] = T.exp(placeholder_28[ax0_ax1_fused_ax2_fused * 12 + i3] - T_softmax_maxelem[0], dtype="float32") - T_softmax_expsum = T.allocate([1], "float32", "global") + T_softmax_expsum = T.decl_buffer([1], "float32") T_softmax_expsum[0] = T.float32(0) for k in T.serial(0, 12): T_softmax_expsum[0] = T_softmax_expsum[0] + T_softmax_exp[k] @@ -1359,20 +1359,20 @@ def run_model(data: T.handle, output: T.handle) -> None: sid_22 = T.allocate_const([1], "int8", [1]) sid_23 = T.allocate_const([2,1], "int8", [3456]) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", data_buffer.data, sid_23.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_8.data, T.cast(T.lookup_param("p0", dtype="handle"), "handle"), sid_7.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_7.data, sid_6.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", data_buffer.data, sid_12.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_12.data, T.cast(T.lookup_param("p1", dtype="handle"), "handle"), sid_11.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_11.data, sid_10.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add_add_multiply_add", sid_6.data, sid_10.data, T.cast(T.lookup_param("p2", dtype="handle"), "handle"), T.cast(T.lookup_param("p3", dtype="handle"), "handle"), T.cast(T.lookup_param("p4", dtype="handle"), "handle"), sid_5.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", sid_5.data, sid_4.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_4.data, 
T.cast(T.lookup_param("p5", dtype="handle"), "handle"), sid_3.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_3.data, sid_2.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", sid_5.data, sid_20.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_20.data, T.cast(T.lookup_param("p6", dtype="handle"), "handle"), sid_19.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_19.data, sid_18.data, dtype="int32")) - T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add", sid_2.data, sid_18.data, output_buffer.data, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", data_buffer.data, sid_23, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_8, T.cast(T.lookup_param("p0", dtype="handle"), "handle"), sid_7, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_7, sid_6, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", data_buffer.data, sid_12, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_12, T.cast(T.lookup_param("p1", dtype="handle"), "handle"), sid_11, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_11, sid_10, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add_add_multiply_add", sid_6, sid_10, T.cast(T.lookup_param("p2", dtype="handle"), "handle"), T.cast(T.lookup_param("p3", dtype="handle"), "handle"), T.cast(T.lookup_param("p4", dtype="handle"), "handle"), sid_5, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform_1", sid_5, sid_4, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_conv2d_NCHWc", sid_4, 
T.cast(T.lookup_param("p5", dtype="handle"), "handle"), sid_3, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_layout_transform", sid_3, sid_2, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape_1", sid_5, sid_20, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_contrib_dense_pack_nn_relu", sid_20, T.cast(T.lookup_param("p6", dtype="handle"), "handle"), sid_19, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_reshape", sid_19, sid_18, dtype="int32")) + T.evaluate(T.tvm_call_cpacked("tvmgen_default_fused_nn_softmax_add", sid_2, sid_18, output_buffer.data, dtype="int32")) # fmt: on diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index e6d123118757..fdda400a779f 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -98,12 +98,14 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T.preflattened_buffer(T_cast_21, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_7 = T.allocate([157323], "int16", "global") + PaddedInput_7_data = T.allocate([157323], "int16", "global") + PaddedInput_7 = T.buffer_decl(shape=[157323], dtype="int16", data=PaddedInput_7_data) for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - 
Conv2dOutput_7 = T.allocate([64], "int32", "global") + Conv2dOutput_7_data = T.allocate([64], "int32", "global") + Conv2dOutput_7 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_7_data) for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ -120,7 +122,8 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T_cast_7 = T.match_buffer(T_cast_6, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) T.preflattened_buffer(T_cast_7, [177], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = T.allocate([200704], "uint8", "global") + tensor_2_data = T.allocate([200704], "uint8", "global") + tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", data=tensor_2_data) for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -140,9 +143,9 @@ def __tvm_main__(input: T.handle, output: T.handle) -> None: T.attr("default", "device_type", 1) sid_9 = T.allocate([301056], "int8", "global") sid_8 = T.allocate([802816], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32")) # fmt: 
on @@ -299,11 +302,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(pla T_cast_5 = T.match_buffer(T_cast_4, [215], dtype="int16") T.preflattened_buffer(T_cast_5, [215], dtype="int16") # body - PaddedInput_1 = T.allocate([379456], "int16", "global") + PaddedInput_1_data = T.allocate([379456], "int16", "global") + PaddedInput_1 = T.buffer_decl(shape=[379456], dtype="int16", data=PaddedInput_1_data) for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64): PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1] = T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625): - Conv2dOutput_1 = T.allocate([64], "int32", "global") + Conv2dOutput_1_data = T.allocate([64], "int32", "global") + Conv2dOutput_1 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_1_data) for ff_1 in T.serial(0, 64): Conv2dOutput_1[ff_1] = 0 for ry, rx, rc_1 in T.grid(3, 3, 64): @@ -324,11 +329,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_add_1 = T.match_buffer(T_add, [407], dtype="int32") T.preflattened_buffer(T_add_1, [407], dtype="int32") # body - PaddedInput_2 = T.allocate([360000], "int16", "global") + PaddedInput_2_data = T.allocate([360000], "int16", "global") + PaddedInput_2 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_2_data) for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64): PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] = placeholder_19[i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2] for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625): - Conv2dOutput_2 = T.allocate([64], "int32", "global") + Conv2dOutput_2_data = T.allocate([64], "int32", "global") + Conv2dOutput_2 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_2_data) for ax3_outer_1 in T.serial(0, 4): for ff_2 in T.serial(0, 64): Conv2dOutput_2[ff_2] = 0 
@@ -352,11 +359,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_s T_cast_7 = T.match_buffer(T_cast_6, [407], dtype="uint8") T.preflattened_buffer(T_cast_7, [407], dtype="uint8") # body - PaddedInput_3 = T.allocate([360000], "int16", "global") + PaddedInput_3_data = T.allocate([360000], "int16", "global") + PaddedInput_3 = T.buffer_decl(shape=[360000], dtype="int16", data=PaddedInput_3_data) for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64): PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] = placeholder_29[i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3] for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625): - Conv2dOutput_3 = T.allocate([64], "int32", "global") + Conv2dOutput_3_data = T.allocate([64], "int32", "global") + Conv2dOutput_3 = T.buffer_decl(shape=[64], dtype="int32", data=Conv2dOutput_3_data) for ax3_outer_2 in T.serial(0, 4): for ff_3 in T.serial(0, 64): Conv2dOutput_3[ff_3] = 0 @@ -376,11 +385,11 @@ def __tvm_main__(input: T.handle, output: T.handle) -> None: sid_6 = T.allocate([5760000], "int8", "global") sid_7 = T.allocate([720000], "int8", "global") sid_8 = T.allocate([720000], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2.data, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8.data, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7.data, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", 
dtype="handle"), sid_6.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast", input, T.lookup_param("p0", dtype="handle"), sid_2, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast", sid_2, T.lookup_param("p3", dtype="handle"), T.lookup_param("p4", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", sid_8, T.lookup_param("p5", dtype="handle"), T.lookup_param("p6", dtype="handle"), sid_7, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", sid_7, T.lookup_param("p7", dtype="handle"), T.lookup_param("p8", dtype="handle"), sid_6, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", sid_2, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_6, output, dtype="int32")) @T.prim_func def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle, T_cast_2: T.handle) -> None: @@ -395,11 +404,13 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(place T_cast_3 = T.match_buffer(T_cast_2, [215], dtype="int16") T.preflattened_buffer(T_cast_3, [215], dtype="int16") # body - PaddedInput = T.allocate([360000], "int16", "global") + PaddedInput_data = T.allocate([360000], "int16", "global") + PaddedInput = T.buffer_decl([360000], "int16", 
data=PaddedInput_data) for i0_i1_fused, i2, i3 in T.grid(75, 75, 64): PaddedInput[i0_i1_fused * 4800 + i2 * 64 + i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3] for ax0_ax1_fused_ax2_fused in T.serial(0, 5625): - Conv2dOutput = T.allocate([64], "int32", "global") + Conv2dOutput_data = T.allocate([64], "int32", "global") + Conv2dOutput = T.buffer_decl([64], "int32", data=Conv2dOutput_data) for ff in T.serial(0, 64): Conv2dOutput[ff] = 0 for rc in T.serial(0, 64): diff --git a/tests/python/unittest/test_tir_usmp_utils.py b/tests/python/unittest/test_tir_usmp_utils.py index 155ff0962def..756b97b0d223 100644 --- a/tests/python/unittest/test_tir_usmp_utils.py +++ b/tests/python/unittest/test_tir_usmp_utils.py @@ -48,12 +48,12 @@ def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholde placeholder_67 = T.match_buffer(placeholder_64, [64], dtype="int32", elem_offset=0, align=64, offset_factor=1) T_cast_21 = T.match_buffer(T_cast_20, [289], dtype="uint8", elem_offset=0, align=64, offset_factor=1) # body - PaddedInput_7 = T.allocate([157323], "int16", "global") + PaddedInput_7 = T.decl_buffer([157323], "int16") for i0_i1_fused_7 in T.serial(0, 229): for i2_7, i3_7 in T.grid(229, 3): PaddedInput_7[(((i0_i1_fused_7*687) + (i2_7*3)) + i3_7)] = T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), placeholder_65[((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)], T.int16(0), dtype="int16") for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544): - Conv2dOutput_7 = T.allocate([64], "int32", "global") + Conv2dOutput_7 = T.decl_buffer([64], "int32") for ff_3 in T.serial(0, 64): Conv2dOutput_7[ff_3] = 0 for ry_2, rx_2, rc_7 in T.grid(7, 7, 3): @@ -68,7 +68,7 @@ def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_7 = T.match_buffer(T_cast_6, [177], 
dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = T.allocate([200704], "uint8", "global") + tensor_2 = T.decl_buffer([200704], "uint8") for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -88,9 +88,9 @@ def tvmgen_default_run_model(input: T.handle, output: T.handle) -> None: T.attr("default", "device_type", 1) sid_9 = T.allocate([301056], "int8", "global") sid_8 = T.allocate([802816], "int8", "global") - T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9.data, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8.data, dtype="int32")) - T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8.data, output, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_cast_subtract", input, T.lookup_param("p0", dtype="handle"), sid_9, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", sid_9, T.lookup_param("p1", dtype="handle"), T.lookup_param("p2", dtype="handle"), sid_8, dtype="int32")) + T.evaluate(T.call_extern("tvmgen_default_fused_nn_max_pool2d_cast", sid_8, output, dtype="int32")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 45ea88f829ec..17622789558d 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -94,12 +94,18 @@ def mmult(A: T.handle, B: T.handle, C: T.handle) -> None: B_1 = T.match_buffer(B, [1024, 1024], elem_offset=0, align=64, offset_factor=1) C_1 = T.match_buffer(C, [1024 * 1024], elem_offset=0, align=64, offset_factor=1) # body - packedB = T.allocate([32768], "float32", "global") + packedB_data = 
T.allocate([32768], "float32", "global") + packedB = T.buffer_decl( + shape=[32768], dtype="float32", scope="global", data=packedB_data + ) for x in T.parallel(0, 32): for y in T.serial(0, 1024): packedB[T.ramp(((x * 32768) + (y * 32)), 1, 32)] = B_1[y, T.ramp(x * 32, 1, 32)] for x_outer in T.parallel(0, 32): - C_global = T.allocate([1024], "float32", "global") + C_global_data = T.allocate([1024], "float32", "global") + C_global = T.buffer_decl( + shape=[1024], dtype="float32", scope="global", data=C_global_data + ) for y_outer in T.serial(0, 32): for x_c_init in T.serial(0, 32): C_global[T.ramp((x_c_init * 32), 1, 32)] = T.broadcast(T.float32(0), 32) @@ -953,11 +959,24 @@ def func( ty = T.env_thread("threadIdx.y") tz = T.env_thread("threadIdx.z") T.launch_thread(bz, 196) - Conv_wmma_accumulator = T.allocate([2048], "float32", "wmma.accumulator") - Apad_shared = T.allocate([12288], "float16", "shared") - W_shared = T.allocate([12288], "float16", "shared") - Apad_shared_wmma_matrix_a = T.allocate([512], "float16", "wmma.matrix_a") - W_shared_wmma_matrix_b = T.allocate([1024], "float16", "wmma.matrix_b") + Conv_wmma_accumulator_data = T.allocate([2048], "float32", "wmma.accumulator") + Conv_wmma_accumulator = T.buffer_decl( + shape=[2048], dtype="float32", scope="wmma.accumulator", data=Conv_wmma_accumulator_data + ) + Apad_shared_data = T.allocate([12288], "float16", "shared") + Apad_shared = T.buffer_decl( + shape=[12288], dtype="float16", scope="shared", data=Apad_shared_data + ) + W_shared_data = T.allocate([12288], "float16", "shared") + W_shared = T.buffer_decl(shape=[12288], dtype="float16", scope="shared", data=W_shared_data) + Apad_shared_wmma_matrix_a_data = T.allocate([512], "float16", "wmma.matrix_a") + Apad_shared_wmma_matrix_a = T.buffer_decl( + shape=[512], dtype="float16", scope="wmma.matrix_a", data=Apad_shared_wmma_matrix_a_data + ) + W_shared_wmma_matrix_b_data = T.allocate([1024], "float16", "wmma.matrix_b") + W_shared_wmma_matrix_b = 
T.buffer_decl( + shape=[1024], dtype="float16", scope="wmma.matrix_b", data=W_shared_wmma_matrix_b_data + ) T.launch_thread(bx, 2) T.launch_thread(by, 4) T.launch_thread(ty, 4) @@ -2479,7 +2498,8 @@ def vthread_func(a: T.handle, c: T.handle) -> None: T.launch_thread(i0, 4) T.launch_thread(i1, 2) T.launch_thread(i2, 2) - B = T.allocate([16], "float32", "local") + B_data = T.allocate([16], "float32", "local") + B = T.buffer_decl(shape=[16], dtype="float32", scope="local", data=B_data) for j in range(16): B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + T.float32(1) for j in range(16): @@ -2792,11 +2812,13 @@ def B(a: T.handle, c: T.handle) -> None: C = T.match_buffer(c, (10), "int32") B = T.alloc_buffer((10), "int32") - K1 = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K1_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K1 = T.buffer_decl(shape=[10], dtype="int32", data=K1_data) for x in T.serial(0, 10): B[x] = A[x] + K1[x] - K2 = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K2_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K2 = T.buffer_decl(shape=[10], dtype="int32", data=K2_data) for x in T.serial(0, 10): B[x] = B[x] + K2[x] @@ -2812,7 +2834,8 @@ def constant(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (10), "int32") C = T.match_buffer(c, (10), "int32") B = T.alloc_buffer((10), "int32") - K = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K_data = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "int32", [10]) + K = T.buffer_decl(shape=[10], dtype="int32", data=K_data) for x in T.serial(0, 10): B[x] = A[x] + K[x] @@ -2961,7 +2984,8 @@ def primfunc_with_allocate_annotations(placeholder_28: T.handle, T_cast_6: T.han placeholder_29 = T.match_buffer(placeholder_28, [802816], dtype="uint8", elem_offset=0, align=64, offset_factor=1) T_cast_7 = T.match_buffer(T_cast_6, [200704], dtype="int16", elem_offset=0, align=64, offset_factor=1) # body - tensor_2 = 
T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"}) + tensor_2_data = T.allocate([200704], "uint8", "global", annotations={"attr1_key": "attr1_value"}) + tensor_2 = T.buffer_decl(shape=[200704], dtype="uint8", scope="global", data=tensor_2_data) for ax0_ax1_fused_4 in T.serial(0, 56): for ax2_4 in T.serial(0, 56): for ax3_init in T.serial(0, 64): @@ -2987,7 +3011,8 @@ def comm_reducer_single_reduce_group(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128 * 128], dtype="float32") for i in T.serial(0, 128): T.launch_thread(threadIdx_x, 128) - reduce_temp0 = T.allocate([1], "float32", "local") + reduce_temp0_data = T.allocate([1], "float32", "local") + reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) with T.attr(T.comm_reducer(lambda x, y: x + y, [T.float32(0)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")): T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle")) @@ -3002,7 +3027,8 @@ def comm_reducer_multiple_reduce_groups(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128 * 128], dtype="float32") for i in T.serial(0, 128): T.launch_thread(threadIdx_x, 128) - reduce_temp0 = T.allocate([1], "float32", "local") + reduce_temp0_data = T.allocate([1], "float32", "local") + reduce_temp0 = T.buffer_decl(shape=[1], dtype="float32", scope="local", data=reduce_temp0_data) with T.attr(T.comm_reducer(lambda x0, x1, y0, y1: (T.Select((x1 >= y1), x0, y0), T.Select((x1 >= y1), x1, y1)), [T.int32(-1), T.min_value("float32")]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle")): T.evaluate(T.tvm_thread_allreduce(T.uint32(1), A[i * 128 + threadIdx_x], True, reduce_temp0.data, threadIdx_x, dtype="handle")) @@ -3149,7 +3175,8 @@ def func_T_ptr_let_statement( def func_T_ptr_allocate(): @T.prim_func def func_T_ptr_allocate() -> None: - A = T.allocate([1024], "float32", "global") + A_data = 
T.allocate([1024], "float32", "global") + A = T.buffer_decl(shape=[1024], dtype="float32", scope="global", data=A_data) A[0] = 0.0 return func_T_ptr_allocate @@ -3240,8 +3267,10 @@ def string_annotation_of_special_chars(): def pointer_type(): @T.prim_func def func_with_ptr_type_annotations(x: T.Ptr[T.int32], y: T.Ptr[T.int32, "shared"]): - xx = T.allocate([16], "int32", "global") - yy = T.allocate([16], "int32", "shared") + xx_data = T.allocate([16], "int32", "global") + xx = T.buffer_decl(shape=[16], dtype="int32", scope="global", data=xx_data) + yy_data = T.allocate([16], "int32", "shared") + yy = T.buffer_decl(shape=[16], dtype="int32", scope="shared", data=yy_data) a: T.Ptr[T.int32] = T.address_of(xx[0], dtype="handle") b: T.Ptr[T.int32, "shared"] = T.address_of(yy[0], dtype="handle") T.evaluate(T.call_extern("copy", a, b, dtype="")) @@ -3313,6 +3342,24 @@ def func(A: T.Buffer[(16, 16), "float32"], B: T.Buffer[(16, 16), "float32"]) -> return func +def allocate_and_decl_buffer(): + @T.prim_func + def func(A: T.Buffer[(16,), "float32"], B: T.Buffer[(16,), "float32"]) -> None: + D_data = T.allocate((16,), "float32", "global") + D = T.decl_buffer((16,), "float32", data=D_data) + for i in range(4): + with T.allocate((4,), "float32", "global") as C_data: + C = T.decl_buffer((4,), "float32", data=C_data) + for j in range(4): + C[j] = A[i * 4 + j] + T.float32(1.0) + for j in range(4): + D[j] = C[j] + for j in range(4): + B[i * 4 + j] = D[j] + + return func + + def float_infinity(): @T.prim_func def func( @@ -3374,6 +3421,7 @@ def func( let_expression, void_ptr, decl_buffer, + allocate_and_decl_buffer, float_infinity, ) From d54c0651ecae088e24fcfd448cfca31c77e8c2cb Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Wed, 31 Aug 2022 20:51:21 +0100 Subject: [PATCH 082/704] [Torch][AArch64] Skip test_load_model___wrong_language__to_pytorch (#12660) This patch makes test_load_model___wrong_language__to_pytorch to be skipped in AArch64 due to a bug that can be reproduced 
when enabling Integration Tests in machines with Torch installed in TVM. ``` The error message seen is: OSError: /usr/local/lib/python3.7/dist-packages/torch/lib/ libgomp-d22c30c5.so.1: cannot allocate memory in static TLS block ``` While the test needs further investigation, it is being set as skipped so other tests can be enabled and not to regress and allow time for the investigation to be made. This relates to the issue described in #10673. --- tests/python/driver/tvmc/test_frontends.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index 98659b05ae5c..1ccac7696fcc 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -269,6 +269,10 @@ def test_load_quantized_model__pth(pytorch_mobilenetv2_quantized): assert p.dtype in ["int8", "uint8", "int32"] # int32 for bias +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant): # some CI environments wont offer pytorch, so skip in case it is not present pytest.importorskip("torch") From a399e6ce9759cd524fcb8f804749baa426096e4b Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 31 Aug 2022 16:10:07 -0700 Subject: [PATCH 083/704] [ci] Add linter for PR title and body (#12367) * [skip ci][ci] Fix Jenkinsfile (#12387) This got out of date after merging #12178 Co-authored-by: driazati * Address comments Co-authored-by: driazati --- Jenkinsfile | 23 +++++- ci/jenkins/Prepare.groovy.j2 | 21 ++++- ci/scripts/check_pr.py | 150 +++++++++++++++++++++++++++++++++++ ci/scripts/git_skip_ci.py | 2 +- 4 files changed, 192 insertions(+), 4 deletions(-) create mode 100644 ci/scripts/check_pr.py diff --git a/Jenkinsfile b/Jenkinsfile index 50eee01fa974..2b73508da0d3 100755 --- 
a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-30T11:58:06.036509 +// Generated at 2022-08-30T15:26:50.100067 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -288,7 +288,7 @@ def should_skip_ci(pr_number) { } withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', - variable: 'TOKEN', + variable: 'GITHUB_TOKEN', )]) { // Exit code of 1 means run full CI (or the script had an error, so run // full CI just in case). Exit code of 0 means skip CI. @@ -301,12 +301,31 @@ def should_skip_ci(pr_number) { return git_skip_ci_code == 0 } +def check_pr(pr_number) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + // never skip CI on build sourced from a branch + return false + } + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + sh ( + script: "python3 ci/scripts/check_pr.py --pr ${pr_number}", + label: 'Check PR title and body', + ) + } + +} + def prepare() { stage('Prepare') { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") { init_git() + check_pr(env.CHANGE_ID) + if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( script: "./ci/scripts/determine_docker_images.py ci_arm=${ci_arm} ci_cortexm=${ci_cortexm} ci_cpu=${ci_cpu} ci_gpu=${ci_gpu} ci_hexagon=${ci_hexagon} ci_i386=${ci_i386} ci_lint=${ci_lint} ci_minimal=${ci_minimal} ci_riscv=${ci_riscv} ci_wasm=${ci_wasm} ", diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 index 94575a7b4b64..6d0c0ec9c4b6 100644 --- a/ci/jenkins/Prepare.groovy.j2 +++ b/ci/jenkins/Prepare.groovy.j2 @@ -138,7 +138,7 @@ def should_skip_ci(pr_number) { } withCredentials([string( credentialsId: 'tvm-bot-jenkins-reader', - variable: 
'TOKEN', + variable: 'GITHUB_TOKEN', )]) { // Exit code of 1 means run full CI (or the script had an error, so run // full CI just in case). Exit code of 0 means skip CI. @@ -151,12 +151,31 @@ def should_skip_ci(pr_number) { return git_skip_ci_code == 0 } +def check_pr(pr_number) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { + // never skip CI on build sourced from a branch + return false + } + withCredentials([string( + credentialsId: 'tvm-bot-jenkins-reader', + variable: 'GITHUB_TOKEN', + )]) { + sh ( + script: "python3 ci/scripts/check_pr.py --pr ${pr_number}", + label: 'Check PR title and body', + ) + } + +} + def prepare() { stage('Prepare') { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/prepare") { init_git() + check_pr(env.CHANGE_ID) + if (env.DETERMINE_DOCKER_IMAGES == 'yes') { sh( script: "./ci/scripts/determine_docker_images.py {% for image in images %}{{ image.name }}={% raw %}${{% endraw %}{{ image.name }}{% raw %}}{% endraw %} {% endfor %}", diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py new file mode 100644 index 000000000000..45d502c6a72e --- /dev/null +++ b/ci/scripts/check_pr.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +import argparse +import re +import os +import textwrap +from dataclasses import dataclass +from typing import Any, List, Callable + + +from git_utils import GitHubRepo, parse_remote, git +from cmd_utils import init_log, tags_from_title + + +GITHUB_USERNAME_REGEX = re.compile(r"(@[a-zA-Z0-9-]+)", flags=re.MULTILINE) +OK = object() +FAIL = object() + + +@dataclass +class Check: + # check to run, returning OK means it passed, anything else means it failed + check: Callable[[str], Any] + + # function to call to generate the error message + error_fn: Callable[[Any], str] + + +def non_empty(s: str): + if len(s) == 0: + return FAIL + return OK + + +def usernames(s: str): + m = GITHUB_USERNAME_REGEX.findall(s) + return m if m else OK + + +def tags(s: str): + items = tags_from_title(s) + if len(items) == 0: + return FAIL + return OK + + +def trailing_period(s: str): + if s.endswith("."): + return FAIL + return OK + + +title_checks = [ + Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"), + Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"), + # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done + # Check( + # check=usernames, + # error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}", + # ), +] +body_checks = [ + Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"), + # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done + # Check( + # check=usernames, + # error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}", + # ), +] + + +def run_checks(checks: List[Check], s: str, name: str) -> bool: + print(f"Running checks for {name}") + print(textwrap.indent(s, prefix=" ")) + passed = True + print(" Checks:") + for i, check in enumerate(checks): + result 
= check.check(s) + if result == OK: + print(f" [{i+1}] {check.check.__name__}: PASSED") + else: + passed = False + msg = check.error_fn(result) + print(f" [{i+1}] {check.check.__name__}: FAILED: {msg}") + + return passed + + +if __name__ == "__main__": + init_log() + help = "Check a PR's title and body for conformance to guidelines" + parser = argparse.ArgumentParser(description=help) + parser.add_argument("--pr", required=True) + parser.add_argument("--remote", default="origin", help="ssh remote to parse") + parser.add_argument( + "--pr-body", help="(testing) PR body to use instead of fetching from GitHub" + ) + parser.add_argument( + "--pr-title", help="(testing) PR title to use instead of fetching from GitHub" + ) + args = parser.parse_args() + + try: + pr = int(args.pr) + except ValueError: + print(f"PR was not a number: {args.pr}") + exit(0) + + if args.pr_body: + body = args.pr_body + title = args.pr_title + else: + remote = git(["config", "--get", f"remote.{args.remote}.url"]) + user, repo = parse_remote(remote) + + github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) + pr = github.get(f"pulls/{args.pr}") + body = pr["body"] + title = pr["title"] + + body = body.strip() + title = title.strip() + + title_passed = run_checks(checks=title_checks, s=title, name="PR title") + print("") + body_passed = run_checks(checks=body_checks, s=body, name="PR body") + + if title_passed and body_passed: + print("All checks passed!") + exit(0) + else: + print( + "Some checks failed, please review the logs above and edit your PR on GitHub accordingly" + ) + exit(1) diff --git a/ci/scripts/git_skip_ci.py b/ci/scripts/git_skip_ci.py index 1e02fcb964fc..162e513275c4 100755 --- a/ci/scripts/git_skip_ci.py +++ b/ci/scripts/git_skip_ci.py @@ -46,7 +46,7 @@ def check_pr_title(): if args.pr_title: title = args.pr_title else: - github = GitHubRepo(token=os.environ["TOKEN"], user=user, repo=repo) + github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, 
repo=repo) pr = github.get(f"pulls/{args.pr}") title = pr["title"] logging.info(f"pr title: {title}") From c6516a534fded605ae24bf56e24ec871b68ca9e2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 31 Aug 2022 19:23:15 -0700 Subject: [PATCH 084/704] [TIR] Allow string/buffer arguments to Schedule cache_read/write (#12661) Previously, the argument needed to be an integer specifying the index into the read/write regions of a block. Now, the argument can be a string specifying the name of the buffer, or the Buffer object itself. This is a follow-up from https://github.com/apache/tvm/pull/11624. --- python/tvm/tir/schedule/schedule.py | 42 ++++++++++++++++--- .../test_tir_schedule_cache_read_write.py | 8 +++- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index 04cc1bc26ad1..d1293371a0e0 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -1014,7 +1014,7 @@ def after_unroll(a: T.handle, b: T.handle) -> None: def cache_read( self, block: Union[BlockRV, str], - read_buffer_index: int, + read_buffer_index: Union[int, str, Buffer], storage_scope: str, consumer_blocks: Optional[List[Union[BlockRV, str]]] = None, ) -> BlockRV: @@ -1029,8 +1029,10 @@ def cache_read( block : Union[BlockRV, str] The consumer block of the target buffer. - read_buffer_index: int - The index of the buffer in block's read region. + buffer: Union[int, str, Buffer] + The index of the buffer in block's read region, the unique + name of a read buffer in the block, or a Buffer object + that is within the blocks read region. storage_scope: str The target storage scope. @@ -1093,13 +1095,21 @@ def after_cache_read(a: T.handle, b: T.handle) -> None: # Convert any string block names into Block RVs. 
consumer_blocks = [self._normalize_block_arg(b) for b in consumer_blocks] block = self._normalize_block_arg(block) + + if not isinstance(read_buffer_index, int): + _, read_buffer_index, _ = self._normalize_buffer_arg( + block, read_buffer_index, required_buffer_type="read" + ) return _ffi_api.ScheduleCacheRead( # type: ignore # pylint: disable=no-member self, block, read_buffer_index, storage_scope, consumer_blocks ) @type_checked def cache_write( - self, block: Union[BlockRV, str], write_buffer_index: int, storage_scope: str + self, + block: Union[BlockRV, str], + write_buffer_index: Union[int, str, Buffer], + storage_scope: str, ) -> BlockRV: """Create a block that reads a buffer region into a write cache. It requires: @@ -1113,7 +1123,9 @@ def cache_write( The producer block of the target buffer. write_buffer_index: int - The index of the buffer in block's write region. + The index of the buffer in block's write region, the unique + name of a write buffer in the block, or a Buffer object + that is within the blocks write region. storage_scope: str The target storage scope. 
@@ -1168,6 +1180,11 @@ def after_cache_write(a: T.handle, b: T.handle) -> None: """ block = self._normalize_block_arg(block) + + if not isinstance(write_buffer_index, int): + _, write_buffer_index, _ = self._normalize_buffer_arg( + block, write_buffer_index, required_buffer_type="write" + ) return _ffi_api.ScheduleCacheWrite( # type: ignore # pylint: disable=no-member self, block, write_buffer_index, storage_scope ) @@ -2352,7 +2369,10 @@ def _normalize_block_arg(self, block: Union[BlockRV, str]) -> BlockRV: return block def _normalize_buffer_arg( - self, block: BlockRV, buffer: Union[Tuple[str, int], str, Buffer] + self, + block: BlockRV, + buffer: Union[Tuple[str, int], int, str, Buffer], + required_buffer_type=None, ) -> Tuple[str, int, Buffer]: block_obj: Block = self.get(block) @@ -2364,6 +2384,9 @@ def iter_buffers(): for i, write in enumerate(block_obj.writes): yield "write", i, write.buffer + if isinstance(buffer, int): + buffer = (required_buffer_type, buffer) + if isinstance(buffer, str): possible_buffers = {} # String lookup requires ensuring that the name is unique @@ -2405,6 +2428,13 @@ def iter_buffers(): else: raise TypeError(f"Invalid type for argument 'buffer': {type(buffer)}") + if required_buffer_type is not None: + assert buffer_index_type == required_buffer_type, ( + f"Expected buffer to be read buffer, " + f"but {buffer_obj.name} was a {buffer_index_type} buffer " + f"in the specified block" + ) + return (buffer_index_type, buffer_index, buffer_obj) @type_checked diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py index 255ca34118d0..cf4836e5361e 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -774,8 +774,12 @@ def test_cache_read_elementwise(use_block_name): sch = tir.Schedule(elementwise, debug_mask="all") block_b = sch.get_block("B") block_c = sch.get_block("C") - 
cached_a = sch.cache_read("B" if use_block_name else block_b, 0, "global") - cached_b = sch.cache_read("C" if use_block_name else block_c, 0, "local") + if use_block_name: + cached_a = sch.cache_read("B", "A", "global") + cached_b = sch.cache_read("C", "B", "local") + else: + cached_a = sch.cache_read(block_b, 0, "global") + cached_b = sch.cache_read(block_c, 0, "local") assert sch.get(cached_a) == sch.get(sch.get_block("A_global")) assert sch.get(cached_b) == sch.get(sch.get_block("B_local")) assert sch.get(block_b) == sch.get(sch.get_block("B")) From aa6c7123d0a2cdd93256c6a4576ff029008fd375 Mon Sep 17 00:00:00 2001 From: Nicola Lancellotti Date: Thu, 1 Sep 2022 08:10:55 +0100 Subject: [PATCH 085/704] [ETHOSN] Fix tests pylint errors (#12649) This pr fixes pylint errors in tests/python/contrib/test_ethosn as reported in issue #11414. --- tests/lint/pylint.sh | 1 + .../contrib/test_ethosn/infrastructure.py | 50 +++++++++++-------- .../contrib/test_ethosn/test_concatenate.py | 10 +++- .../test_ethosn/test_constant_duplication.py | 10 ++-- .../python/contrib/test_ethosn/test_conv2d.py | 18 ++++--- .../test_ethosn/test_depth_to_space.py | 4 ++ .../test_ethosn/test_fullyconnected.py | 25 +++++----- .../contrib/test_ethosn/test_leaky_relu.py | 2 + tests/python/contrib/test_ethosn/test_mean.py | 2 + .../contrib/test_ethosn/test_multiply.py | 3 ++ .../contrib/test_ethosn/test_networks.py | 13 +++-- .../test_ethosn/test_partition_params.py | 24 ++++++--- .../contrib/test_ethosn/test_pooling.py | 8 ++- tests/python/contrib/test_ethosn/test_relu.py | 4 ++ .../contrib/test_ethosn/test_requantize.py | 5 ++ .../python/contrib/test_ethosn/test_resize.py | 4 ++ .../contrib/test_ethosn/test_sigmoid.py | 11 ++-- .../python/contrib/test_ethosn/test_split.py | 9 +++- tests/python/contrib/test_ethosn/test_tanh.py | 4 ++ .../contrib/test_ethosn/test_topologies.py | 33 +++++++----- 20 files changed, 158 insertions(+), 82 deletions(-) diff --git a/tests/lint/pylint.sh 
b/tests/lint/pylint.sh index 2228e110c15e..94fae289b6b9 100755 --- a/tests/lint/pylint.sh +++ b/tests/lint/pylint.sh @@ -21,6 +21,7 @@ python3 -m pylint python/tvm --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint vta/python/vta --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/unittest/test_tvmscript_type.py --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/contrib/test_cmsisnn --rcfile="$(dirname "$0")"/pylintrc +python3 -m pylint tests/python/contrib/test_ethosn --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/relay/aot/*.py --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/ci --rcfile="$(dirname "$0")"/pylintrc python3 -m pylint tests/python/integration/ --rcfile="$(dirname "$0")"/pylintrc diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index a1c8ca0a32d2..0071b1a7f52e 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -18,17 +18,17 @@ """Arm(R) Ethos(TM)-N test functions""" from __future__ import absolute_import, print_function -import tvm -from tvm import relay -from tvm.contrib import utils, graph_executor, download from hashlib import md5 from itertools import zip_longest, combinations +import os import numpy as np from PIL import Image -import os -from . import _infrastructure +import tvm +from tvm import relay +from tvm.contrib import utils, graph_executor, download from tvm.relay.op.contrib import partition_for_ethosn +from . 
import _infrastructure def get_real_image(im_height, im_width): @@ -82,23 +82,25 @@ def make_module(func, params): def make_ethosn_composite(ethosn_expr, name): - vars = relay.analysis.free_vars(ethosn_expr) - inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars] + variables = relay.analysis.free_vars(ethosn_expr) + inner_vars = [relay.Var(v.name_hint, v.type_annotation) for v in variables] func = relay.Function(inner_vars, ethosn_expr) func = func.with_attr("Composite", name) - call = relay.Call(func, vars) + call = relay.Call(func, variables) return call def make_ethosn_partition(ethosn_expr): + """Make an Ethos(TM)-N partition.""" + # Create an Ethos-N global function mod = tvm.IRModule({}) - vars = relay.analysis.free_vars(ethosn_expr) + variables = relay.analysis.free_vars(ethosn_expr) # NB: it is illegal to reuse variables inside and outside a scope in Relay # if you want to duplicate types and names you must re-allocate them. - fresh_vars = [relay.Var(v.name_hint, v.type_annotation) for v in vars] + fresh_vars = [relay.Var(v.name_hint, v.type_annotation) for v in variables] binds = {} - for var, fresh_var in zip(vars, fresh_vars): + for var, fresh_var in zip(variables, fresh_vars): binds[var] = fresh_var ethosn_expr_fresh = relay.bind(ethosn_expr, binds) func = relay.Function(fresh_vars, ethosn_expr_fresh) @@ -106,19 +108,21 @@ def make_ethosn_partition(ethosn_expr): func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) func = func.with_attr("Compiler", "ethos-n") func = func.with_attr("global_symbol", "ethos-n_0") - g1 = relay.GlobalVar("ethos-n_0") - mod[g1] = func + global_var = relay.GlobalVar("ethos-n_0") + mod[global_var] = func mod = relay.transform.InferType()(mod) # These are the vars to call the Ethos-N partition with more_vars = relay.analysis.free_vars(ethosn_expr) # Call the Ethos-N partition in main - call_fn1 = g1(*more_vars) + call_fn1 = global_var(*more_vars) mod["main"] = relay.Function(more_vars, call_fn1) return 
relay.transform.InferType()(mod) def get_host_op_count(mod): + """Return the number of host operators.""" + class Counter(tvm.relay.ExprVisitor): def __init__(self): super().__init__() @@ -219,9 +223,7 @@ def run(lib, inputs, outputs, npu=True): return out -def build_and_run( - mod, inputs, outputs, params, device=tvm.cpu(), npu=True, expected_host_ops=0, npu_partitions=1 -): +def build_and_run(mod, inputs, outputs, params, npu=True, expected_host_ops=0, npu_partitions=1): lib = build(mod, params, npu, expected_host_ops, npu_partitions) return run(lib, inputs, outputs, npu) @@ -254,6 +256,8 @@ def inference_result(outputs): def test_error(mod, params, err_msg): + """Test an operator error message.""" + caught = None with tvm.transform.PassContext( opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}} @@ -262,8 +266,8 @@ def test_error(mod, params, err_msg): try: mod = relay.transform.InferType()(mod) relay.build(mod, params=params) - except tvm.error.TVMError as e: - caught = e.args[0] + except tvm.error.TVMError as error: + caught = error.args[0] finally: relay.backend.te_compiler.get().clear() @@ -275,8 +279,8 @@ def get_conv2d(var, shape, dtype): """Standard convolution to test activation functions""" weight_shape = (1, 1, shape[3], 1) - w = tvm.nd.array(np.ones(weight_shape, dtype)) - weights = relay.const(w, dtype) + weights_array = tvm.nd.array(np.ones(weight_shape, dtype)) + weights = relay.const(weights_array, dtype) conv = relay.qnn.op.conv2d( var, weights, @@ -300,13 +304,15 @@ def get_conv2d(var, shape, dtype): relay.const(0, "int32"), # output zero point out_dtype=dtype, ) - params = {"w": w, "b": b} + params = {"w": weights_array, "b": b} return req, params def get_conv2d_qnn_params( dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, channels ): + """Return Conv2D QNN params.""" + kernel_sc = ( kernel_sc.numpy() if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray) else [kernel_sc] ) diff --git 
a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py index b2eba6d650e0..cd4ec7a4e4b2 100644 --- a/tests/python/contrib/test_ethosn/test_concatenate.py +++ b/tests/python/contrib/test_ethosn/test_concatenate.py @@ -57,6 +57,8 @@ def _get_model(shapes, dtype, axis): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_concatenate(dtype): + """Compare Concatenate output with TVM.""" + trials = [ ([(1, 4), (1, 6)], 1), ([(1, 16, 4), (1, 16, 4)], 1), @@ -78,19 +80,23 @@ def test_concatenate(dtype): @requires_ethosn def test_concatenate_failure(): + """Check Concatenate error messages.""" + trials = [ ([(1, 4, 4, 4, 4), (1, 4, 4, 4, 4)], "uint8", 1, "dimensions=5, dimensions must be <= 4;"), ( [(1, 4, 4, 4), (1, 4, 4, 4)], "uint8", 3, - "Concatenation along the channels dimension (axis 3) requires input tensors with a multiple of 16 channels;", + "Concatenation along the channels dimension (axis 3) " + "requires input tensors with a multiple of 16 channels;", ), ( [(1, 4, 4, 4), (1, 4, 4, 4)], "int16", 2, - "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', dtype must be either uint8, int8 or int32;", + "dtype='int16', dtype must be either uint8, int8 or int32; dtype='int16', " + "dtype must be either uint8, int8 or int32;", ), ( [(2, 4, 4, 4), (2, 4, 4, 4)], diff --git a/tests/python/contrib/test_ethosn/test_constant_duplication.py b/tests/python/contrib/test_ethosn/test_constant_duplication.py index 84956840ecbb..b3cd0046f508 100644 --- a/tests/python/contrib/test_ethosn/test_constant_duplication.py +++ b/tests/python/contrib/test_ethosn/test_constant_duplication.py @@ -36,8 +36,10 @@ def _get_model(): add_const = relay.const(add_const_value, "uint8") a = relay.add(a, add_const) weight_shape = (kernel_h, kernel_w, shape[3], out_channels) - w = tvm.nd.array(np.random.randint(low=0, high=255, size=weight_shape, dtype="uint8")) - weights = relay.const(w, "uint8") + 
weights_array = tvm.nd.array( + np.random.randint(low=0, high=255, size=weight_shape, dtype="uint8") + ) + weights = relay.const(weights_array, "uint8") conv = relay.qnn.op.conv2d( a, weights, @@ -66,12 +68,14 @@ def _get_model(): relay.const(0, "int32"), # output zero point out_dtype="uint8", ) - params = {"w": w, "b": b} + params = {"w": weights_array, "b": b} return req, params @requires_ethosn def test_constant_duplication(): + """Test that constants are not duplicated.""" + np.random.seed(0) model, params = _get_model() mod = tei.make_module(model, params) diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index a411701ea0bc..ffe66f0d2be2 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -17,9 +17,9 @@ """Arm(R) Ethos(TM)-N integration conv2d tests""" +import math import numpy as np import pytest -import math import tvm from tvm import relay from tvm.testing import requires_ethosn @@ -61,7 +61,7 @@ def _get_model( ): """Return a model and any parameters it may have""" a = relay.var("a", shape=shape, dtype=dtype) - if pad == "op" or pad == "both": + if pad in ("op", "both"): p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) a = relay.nn.pad( a, @@ -76,12 +76,12 @@ def _get_model( weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels) else: weight_shape = (kernel_h, kernel_w, out_channels, 1) - w = tvm.nd.array( + weights_array = tvm.nd.array( np.random.randint( np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=weight_shape, dtype=dtype ) ) - weights = relay.const(w, dtype) + weights = relay.const(weights_array, dtype) conv = relay.qnn.op.conv2d( a, weights, @@ -96,7 +96,7 @@ def _get_model( strides=strides, groups=groups, channels=out_channels, - padding=p if pad == "attr" or pad == "both" else (0, 0, 0, 0), + padding=p if pad in ("attr", "both") else (0, 0, 0, 0), 
out_dtype="int32", ) b = tvm.nd.array( @@ -118,7 +118,7 @@ def _get_model( relay.const(output_zp, "int32"), # output zero point out_dtype=dtype, ) - params = {"w": w, "b": b} + params = {"w": weights_array, "b": b} return req, params @@ -126,6 +126,8 @@ def _get_model( @pytest.mark.parametrize("depthwise", [False, True]) @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_conv2d(dtype, depthwise): + """Compare Conv2D output with TVM.""" + trials = [ [(1, 17, 20, 26), 4, 3, 1, "attr", (2, 2), (1, 1), False], [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), False], @@ -208,6 +210,8 @@ def test_conv2d(dtype, depthwise): @requires_ethosn def test_conv2d_failure(): + """Check Conv2D error messages.""" + trials = [ ( (1, 4, 4, 4), @@ -326,7 +330,7 @@ def test_conv2d_failure(): weight_format, err_msg, ) in trials: - model, params = _get_model( + model, _ = _get_model( shape, kernel_h, kernel_w, diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py index 1675b82eeace..c071fe00f212 100644 --- a/tests/python/contrib/test_ethosn/test_depth_to_space.py +++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py @@ -34,6 +34,8 @@ def _get_model(shape, block, dtype, layout): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_depth_to_space(dtype): + """Compare Depth To Space output with TVM.""" + trials = [ (1, 16, 16, 16), (1, 64, 32, 16), @@ -59,6 +61,8 @@ def test_depth_to_space(dtype): @requires_ethosn def test_depth_to_space_failure(): + """Check Depth To Space error messages.""" + trials = [ ((2, 16, 16, 16), 2, "uint8", "NHWC", "batch size=2, batch size must = 1"), ( diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py index 2caca9e890a2..d5510bb79d2c 100644 --- a/tests/python/contrib/test_ethosn/test_fullyconnected.py +++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py @@ 
-30,9 +30,9 @@ def _get_model( ): """Return a model an any parameters it may have""" a = relay.var("a", shape=shape, dtype=dtype) - w = tvm.nd.array(np.ones(weight_shape, dtype)) - weights = relay.const(w, dtype) - fc = relay.qnn.op.dense( + weights_array = tvm.nd.array(np.ones(weight_shape, dtype)) + weights = relay.const(weights_array, dtype) + dense = relay.qnn.op.dense( a, weights, input_zero_point=relay.const(input_zp, "int32"), @@ -44,7 +44,7 @@ def _get_model( ) b = tvm.nd.array(np.random.randint(0, high=255, size=(weight_shape[0],), dtype="int32")) biasc = relay.const(b, "int32") - bias = relay.nn.bias_add(fc, biasc) + bias = relay.nn.bias_add(dense, biasc) req = relay.qnn.op.requantize( bias, relay.const(input_sc * kernel_sc, "float32"), # input zero scale @@ -53,7 +53,7 @@ def _get_model( relay.const(output_zp, "int32"), # output zero point out_dtype=dtype, ) - params = {"w": w, "b": b} + params = {"w": weights_array, "b": b} return req, params @@ -76,9 +76,8 @@ def _get_model( ], ) def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_zp, kernel_sc): - """ - Test fully connected offloading. 
- """ + """Compare Fully Connected output with TVM.""" + np.random.seed(0) inputs = { "a": tvm.nd.array( @@ -116,6 +115,8 @@ def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_z @requires_ethosn def test_fullyconnected_failure(): + """Check Fully Connected error messages.""" + trials = [ ( (1, 64), @@ -139,7 +140,8 @@ def test_fullyconnected_failure(): 0, 1, "uint8", - "Weights tensor must have I dimension equal to the number of channels of the input tensor.;", + "Weights tensor must have I dimension equal to the number" + " of channels of the input tensor.;", ), ((1024, 64), (1, 64), 0, 1, 0, 1, 0, 1, "uint8", "batch size=1024, batch size must = 1;"), ] @@ -157,10 +159,7 @@ def test_fullyconnected_failure(): dtype, err_msg, ) in trials: - inputs = { - "a": tvm.nd.array(np.random.randint(0, high=255, size=shape, dtype=dtype)), - } - model, params = _get_model( + model, _ = _get_model( shape, weight_shape, input_zp, diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py index cdd06f5e73e4..3c3bbc709679 100644 --- a/tests/python/contrib/test_ethosn/test_leaky_relu.py +++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py @@ -49,6 +49,7 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype, alpha): @pytest.mark.parametrize("alpha", [0.001, 0.5678]) def test_leaky_relu(dtype, shape, alpha): """Compare Leaky ReLU output with TVM.""" + np.random.seed(0) iinfo = np.iinfo(dtype) @@ -75,6 +76,7 @@ def test_leaky_relu(dtype, shape, alpha): @pytest.mark.parametrize("alpha", [-1.34, 2.32, 1, 0]) def test_leaky_relu_unsupported_alpha(dtype, shape, alpha): """Test unsupported values of alpha (<= 0, >= 1) in Leaky ReLU.""" + iinfo = np.iinfo(dtype) zp_min = iinfo.min diff --git a/tests/python/contrib/test_ethosn/test_mean.py b/tests/python/contrib/test_ethosn/test_mean.py index 548743fe9548..0ad7e17faed8 100644 --- a/tests/python/contrib/test_ethosn/test_mean.py +++ 
b/tests/python/contrib/test_ethosn/test_mean.py @@ -45,6 +45,7 @@ def _get_model(shape, axis, keepdims, input_zp, input_sc, output_zp, output_sc, @pytest.mark.parametrize("shape", [(1, 7, 7, 2048), (1, 8, 8)]) def test_mean(dtype, shape): """Compare Mean output with TVM.""" + np.random.seed(0) zp_min = np.iinfo(dtype).min @@ -68,6 +69,7 @@ def test_mean(dtype, shape): @pytest.mark.parametrize("dtype", ["int8", "uint8"]) def test_mean_non_equal_quantization(dtype): """Test mean is not offloaded when quantization is not equal.""" + np.random.seed(0) shape = (1, 7, 7, 2048) diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py index 38d8516b6721..cb95a97db529 100644 --- a/tests/python/contrib/test_ethosn/test_multiply.py +++ b/tests/python/contrib/test_ethosn/test_multiply.py @@ -69,6 +69,7 @@ def _get_model( @pytest.mark.parametrize("reverse_inputs", [False, True]) def test_multiply(dtype, shape, constant_shape, reverse_inputs): """Compare Multiply output with TVM.""" + np.random.seed(0) iinfo = np.iinfo(dtype) @@ -106,6 +107,7 @@ def test_multiply(dtype, shape, constant_shape, reverse_inputs): @requires_ethosn def test_multiply_multiple_inputs_unsupported(): """Check multiply operator with two inputs is not offloaded.""" + np.random.seed(0) shape = (1, 4, 5, 6) @@ -151,6 +153,7 @@ def test_multiply_multiple_inputs_unsupported(): @requires_ethosn def test_multiply_unsupported_datatype(): """Check multiply operator with unsupported datatype is not offloaded.""" + np.random.seed(0) shape = (1, 4, 5, 6) diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index 11745409d4ea..db1b41244846 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -14,7 +14,7 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. - +# pylint: disable=wrong-import-position """Arm(R) Ethos(TM)-N integration end-to-end network tests""" import pytest @@ -22,12 +22,11 @@ pytest.importorskip("tflite") pytest.importorskip("tensorflow") +import tflite.Model from tvm import relay from tvm.testing import requires_ethosn from tvm.contrib import download - import tvm.relay.testing.tf as tf_testing -import tflite.Model from . import infrastructure as tei @@ -41,10 +40,10 @@ def _get_tflite_model(tflite_model_path, inputs_dict, dtype): tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0) shape_dict = {} dtype_dict = {} - for input in inputs_dict: - input_shape = inputs_dict[input] - shape_dict[input] = input_shape - dtype_dict[input] = dtype + for value in inputs_dict: + input_shape = inputs_dict[value] + shape_dict[value] = input_shape + dtype_dict[value] = dtype return relay.frontend.from_tflite( tflite_model, diff --git a/tests/python/contrib/test_ethosn/test_partition_params.py b/tests/python/contrib/test_ethosn/test_partition_params.py index 34e22e6aaba8..e8ac687c04b0 100644 --- a/tests/python/contrib/test_ethosn/test_partition_params.py +++ b/tests/python/contrib/test_ethosn/test_partition_params.py @@ -18,19 +18,23 @@ """Arm(R) Ethos(TM)-N partition parameter tests""" import pytest -import tvm -from tvm import relay import numpy as np +import tvm +from tvm import relay from tvm.relay.op.contrib.ethosn import partition_for_ethosn from tvm.testing import requires_ethosn @requires_ethosn def test_ethosn78_partition_no_error(): + """Test Arm(R) Ethos(TM)-N78 partition""" + a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8") + weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) + res = 
relay.nn.conv2d( + a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" + ) b = relay.var("b", shape=[8], dtype="uint8") res = relay.nn.bias_add(res, b, axis=1) @@ -41,13 +45,15 @@ def test_ethosn78_partition_no_error(): @requires_ethosn def test_ethosn78_partition_undefined_variant(): + """Test Arm(R) Ethos(TM)-N78 partition with undefined variant""" + with pytest.raises( ValueError, match=r".*Please specify a variant in the target string, e.g. -variant=n78.*" ): a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) + weights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" + a, weights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" ) b = relay.var("b", shape=[8], dtype="uint8") res = relay.nn.bias_add(res, b, axis=1) @@ -58,13 +64,15 @@ def test_ethosn78_partition_undefined_variant(): @requires_ethosn def test_ethosn78_partition_invalid_variant(): + """Test Arm(R) Ethos(TM)-N78 partition with invalid variant""" + with pytest.raises( ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=n78 should be set.*" ): a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) + wwights = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" + a, wwights, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" ) b = relay.var("b", shape=[8], dtype="uint8") res = relay.nn.bias_add(res, b, axis=1) diff --git a/tests/python/contrib/test_ethosn/test_pooling.py b/tests/python/contrib/test_ethosn/test_pooling.py index 3defaa55e853..e1c7358f71a1 100644 --- a/tests/python/contrib/test_ethosn/test_pooling.py +++ 
b/tests/python/contrib/test_ethosn/test_pooling.py @@ -28,10 +28,10 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype): """Return a model and any parameters it may have""" req = relay.var("a", shape=shape, dtype=dtype) - if typef == relay.nn.avg_pool2d: + if typef is relay.nn.avg_pool2d: req = relay.cast(req, "int32") req = typef(req, pool_size=sizes, strides=strides, padding=pads, ceil_mode=True, layout=layout) - if typef == relay.nn.avg_pool2d: + if typef is relay.nn.avg_pool2d: req = relay.cast(req, dtype) return req @@ -39,6 +39,8 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_pooling(dtype): + """Compare Pooling output with TVM.""" + trials = [ ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0), "NHWC"), ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0), "NHWC"), @@ -65,6 +67,8 @@ def test_pooling(dtype): @requires_ethosn def test_pooling_failure(): + """Check Pooling error messages.""" + trials = [ ( (2, 8, 8, 8), diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py index 5d3e8f1e9921..f56a1cd7ad3c 100644 --- a/tests/python/contrib/test_ethosn/test_relu.py +++ b/tests/python/contrib/test_ethosn/test_relu.py @@ -35,6 +35,8 @@ def _get_model(shape, dtype, a_min, a_max): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_relu(dtype): + """Compare Relu output with TVM.""" + trials = [ ((1, 4, 4, 4), 65, 178, "uint8"), ((1, 8, 4, 2), 1, 254, "uint8"), @@ -68,6 +70,8 @@ def test_relu(dtype): @requires_ethosn def test_relu_failure(): + """Check Relu error messages.""" + trials = [ ((1, 4, 4, 4, 4), "uint8", 65, 78, "dimensions=5, dimensions must be <= 4"), ((1, 8, 4, 2), "int16", 1, 254, "dtype='int16', dtype must be either uint8, int8 or int32"), diff --git a/tests/python/contrib/test_ethosn/test_requantize.py 
b/tests/python/contrib/test_ethosn/test_requantize.py index e20c3beeabfa..3187c22f3391 100644 --- a/tests/python/contrib/test_ethosn/test_requantize.py +++ b/tests/python/contrib/test_ethosn/test_requantize.py @@ -43,6 +43,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, in_dtype, out_dt @pytest.mark.parametrize("out_dtype", ["int8", "uint8"]) @pytest.mark.parametrize("shape", [(1, 52, 52, 3)]) def test_requantize(in_dtype, out_dtype, shape): + """Compare Requantize output with TVM.""" + np.random.seed(0) low = 0 if in_dtype == "uint8" else -5 high = low + 10 @@ -74,6 +76,7 @@ def test_requantize_mixed_precision_with_following_op(): Checks a requantize operation that changes precision from uint8 to int8 with a following add op. """ + np.random.seed(0) shape = (1, 4, 6, 8) in_sc = 0.012566 @@ -133,6 +136,8 @@ def get_model(): @requires_ethosn def test_requantize_failure(): + """Check Requantize error messages.""" + input_sc = 0.8 output_sc = (input_sc / 128) - 0.0001 model = _get_model( diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py index b9d807d21926..2cc641e63b5c 100644 --- a/tests/python/contrib/test_ethosn/test_resize.py +++ b/tests/python/contrib/test_ethosn/test_resize.py @@ -68,6 +68,8 @@ def _get_model( ], ) def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_method): + """Compare Resize output with TVM.""" + np.random.seed(0) zp_min = np.iinfo(dtype).min zp_max = np.iinfo(dtype).max @@ -96,6 +98,8 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met @requires_ethosn def test_resize_failure(): + """Check Resize error messages.""" + trials = [ ( (30, 20), diff --git a/tests/python/contrib/test_ethosn/test_sigmoid.py b/tests/python/contrib/test_ethosn/test_sigmoid.py index 9947bee3b86b..ae8c301ff01a 100644 --- a/tests/python/contrib/test_ethosn/test_sigmoid.py +++ b/tests/python/contrib/test_ethosn/test_sigmoid.py @@ -45,6 
+45,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_sigmoid(dtype): + """Compare Sigmoid output with TVM.""" + trials = [ (1, 16, 16, 16), (1, 8, 8), @@ -61,7 +63,7 @@ def test_sigmoid(dtype): } outputs = [] for npu in [False, True]: - for d in range(1, 2): + for _ in range(1, 2): if dtype == "uint8": input_zp = 0 output_zp = 0 @@ -78,21 +80,22 @@ def test_sigmoid(dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_sigmoid_failure(dtype): + """Check Sigmoid error messages.""" + test_zp = 0 if dtype == "uint8" else -128 trials = [ - ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, dtype, "batch size=2, batch size must = 1"), + ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, "batch size=2, batch size must = 1"), ( (1, 4, 4, 4), 64, 0.2, 3, 1, - dtype, f"output quantization params=(3, 1), must = ({test_zp}, 1/256)", ), ] - for shape, input_zp, input_sc, output_zp, output_sc, dtype, err_msg in trials: + for shape, input_zp, input_sc, output_zp, output_sc, err_msg in trials: model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype) model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid") mod = tei.make_ethosn_partition(model) diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py index 4d1743d07a32..7f8787afe947 100644 --- a/tests/python/contrib/test_ethosn/test_split.py +++ b/tests/python/contrib/test_ethosn/test_split.py @@ -37,6 +37,8 @@ def _get_model(shape, dtype, splits, axis): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_split(dtype): + """Compare Split output with TVM.""" + trials = [ ((1, 16, 16, 32), (2, 7, 10), 2), ((1, 12, 8, 16), 3, 1), @@ -55,7 +57,7 @@ def test_split(dtype): for npu in [False, True]: model = _get_model(shape, dtype, splits, axis) mod = tei.make_module(model, {}) - output_count = splits if type(splits) 
== int else len(splits) + 1 + output_count = splits if isinstance(splits, int) else len(splits) + 1 outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu)) tei.verify(outputs, dtype, 0) @@ -64,6 +66,8 @@ def test_split(dtype): @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn def test_split_failure(): + """Check Split error messages.""" + trials = [ ((1, 4, 4, 4, 4), "uint8", 4, 2, "dimensions=5, dimensions must be <= 4;"), ((1, 4, 4, 4), "int16", 4, 2, "dtype='int16', dtype must be either uint8, int8 or int32;"), @@ -74,7 +78,8 @@ def test_split_failure(): "uint8", 4, 3, - "Split along the channels dimension (axis 3) requires all output sizes (specified in splitInfo.m_Sizes) to be multiples of 16;", + "Split along the channels dimension (axis 3) requires all output sizes " + "(specified in splitInfo.m_Sizes) to be multiples of 16;", ), ] diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py index 8f44936fdc4f..68170601c5f8 100644 --- a/tests/python/contrib/test_ethosn/test_tanh.py +++ b/tests/python/contrib/test_ethosn/test_tanh.py @@ -46,6 +46,8 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype): @pytest.mark.parametrize("dtype", ["uint8", "int8"]) @pytest.mark.parametrize("shape", [(1, 52, 52, 3)]) def test_tanh(dtype, shape): + """Compare Tanh output with TVM.""" + zp_min = np.iinfo(dtype).min zp_max = np.iinfo(dtype).max @@ -78,6 +80,8 @@ def test_tanh(dtype, shape): ], ) def test_tanh_failure(shape, input_zp, input_sc, output_zp, output_sc, err_msg, dtype): + """Check Tanh error messages.""" + test_zp = 0 if dtype == "int8" else 128 model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype) model = tei.make_ethosn_composite(model, "ethos-n.qnn_tanh") diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py index 970f7dce5cbd..19d7accadb6d 
100644 --- a/tests/python/contrib/test_ethosn/test_topologies.py +++ b/tests/python/contrib/test_ethosn/test_topologies.py @@ -31,6 +31,8 @@ @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_split_add_concat(dtype): + """Test a model with split, add and contatenate.""" + def get_model(input_shape, dtype, var_names): """Return a model""" @@ -148,23 +150,25 @@ def get_model(dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_output_order(dtype): + """Test the output order.""" + def get_model(input_shape, dtype, var_names): """Return a model""" - min = np.iinfo(dtype).min - max = np.iinfo(dtype).max + min_value = np.iinfo(dtype).min + max_value = np.iinfo(dtype).max a = relay.var(next(var_names), shape=input_shape, dtype=dtype) - z = relay.op.clip(a, min, max) - b = relay.op.clip(z, min, min + 15) - c = relay.op.clip(z, min + 16, min + 31) - d = relay.op.clip(z, min + 32, min + 47) - e = relay.op.clip(z, min + 48, min + 63) - f = relay.op.clip(z, min + 64, min + 79) - g = relay.op.clip(z, min + 80, min + 95) - h = relay.op.clip(z, min + 96, min + 111) - i = relay.op.clip(z, min + 112, max) - return relay.Tuple((d, c, e, f, i, b, h, g)) + op_z = relay.op.clip(a, min_value, max_value) + op_b = relay.op.clip(op_z, min_value, min_value + 15) + op_c = relay.op.clip(op_z, min_value + 16, min_value + 31) + op_d = relay.op.clip(op_z, min_value + 32, min_value + 47) + op_e = relay.op.clip(op_z, min_value + 48, min_value + 63) + op_f = relay.op.clip(op_z, min_value + 64, min_value + 79) + op_g = relay.op.clip(op_z, min_value + 80, min_value + 95) + op_h = relay.op.clip(op_z, min_value + 96, min_value + 111) + op_i = relay.op.clip(op_z, min_value + 112, max_value) + return relay.Tuple((op_d, op_c, op_e, op_f, op_i, op_b, op_h, op_g)) np.random.seed(0) inputs = { @@ -190,6 +194,7 @@ def test_output_order_different_sizes(dtype): """ Test the output order when there are multiple outputs of different sizes. 
""" + np.random.seed(0) input_name = "a" input_shape = (1, 8, 8, 4) @@ -233,6 +238,8 @@ def get_model(): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_split_with_asym_concats(dtype): + """Test a model with split and contatenates.""" + def get_model(shape, dtype, splits, axis): a = relay.var("a", shape=shape, dtype=dtype) split = relay.op.split(a, indices_or_sections=splits, axis=axis) @@ -335,6 +342,8 @@ def get_model(dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_input_tuples(dtype): + """Test a model with a tuple as input.""" + def get_model(shapes, dtype, axis): tup = [] for i, shape in enumerate(shapes): From 38ba8c0bb69dd76203a96ba6b2a5c067fe0b2ba0 Mon Sep 17 00:00:00 2001 From: sisleyli <43139237+sisleyli@users.noreply.github.com> Date: Thu, 1 Sep 2022 18:32:42 +0800 Subject: [PATCH 086/704] [Relay] Extract intermediate node by its expression ID (#12646) [Relay] Extract Intermediate Expr by relay expr ID for analysis modify doc comments Co-authored-by: Bin Li --- python/tvm/relay/analysis/analysis.py | 38 +++++ .../analysis/extract_intermediate_expr.cc | 88 ++++++++++++ ...test_analysis_extract_intermediate_expr.py | 130 ++++++++++++++++++ 3 files changed, 256 insertions(+) create mode 100644 src/relay/analysis/extract_intermediate_expr.cc create mode 100644 tests/python/relay/test_analysis_extract_intermediate_expr.py diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 3b38c07a0a8a..12f659f0037c 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -431,3 +431,41 @@ def get_calibration_data(mod, data): calib_data[gvar] = value return calib_data + + +def extract_intermdeiate_expr(mod, expr_id): + """Extract Relay Expr by its expression ID + + This function is used for extracting Relay Expr + by its expression ID of the main function + that we can see in `print(mod["main"])`. 
+ + Parameters + ---------- + mod : tvm.IRModule + + expr_id : the Expr ID that we want to extract + + Returns + ------- + ret : Extracted IRModule + + Examples + -------- + .. code-block:: python + + # Suppose our module is printed like this: + # def @main(%x: Tensor[(1, 1, 5, 1), float32], %w1, %w2) { + # %0 = nn.conv2d(%x, %w1, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]); + # %1 = nn.conv2d(%0, %w2, padding=[1, 1, 1, 1], channels=1, kernel_size=[3, 3]); + # %2 = add(%0, %1); + # %3 = split(%2, indices_or_sections=1); + # %4 = %3.0; + # add(%4, 1f) + # } + # if we want to extract `%1 = nn.conv2d` + from tvm import relay + + relay.analysis.extract_intermdeiate_expr(mod, 1) + """ + return _ffi_api.ExtractIntermediateExpr(mod, expr_id) diff --git a/src/relay/analysis/extract_intermediate_expr.cc b/src/relay/analysis/extract_intermediate_expr.cc new file mode 100644 index 000000000000..d7466e2729db --- /dev/null +++ b/src/relay/analysis/extract_intermediate_expr.cc @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file extract_intermediate_expr.cc + * \brief Used for extracting Relay Expr + by the expression ID of the main function + that we can see in `print(mod["main"])`. 
+ */ +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +class ExtractIntermediateExprWrapper : private MixedModeVisitor { + public: + explicit ExtractIntermediateExprWrapper(const IRModule& mod, const int expr_id) + : mod_(mod), target_expr_id_(expr_id), counter_(0) {} + + IRModule Extract() { + VisitExpr(this->mod_->Lookup("main")); + + // ensure the target expr_id we want to extract is valid. + ICHECK(target_expr_id_ >= 0 && target_expr_id_ < counter_); + + return IRModule::FromExpr(target_op_, {}); + } + + private: + using MixedModeVisitor::VisitExpr_; + + const IRModule mod_; + /*! \brief the expr id that we want to extract. */ + const int target_expr_id_; + int counter_; + Expr target_op_; + + void VisitExpr_(const CallNode* n) final { + CheckCounterAndIncrease(GetRef(n)); + MixedModeVisitor::VisitExpr_(n); + } + + void VisitExpr_(const TupleNode* n) final { + CheckCounterAndIncrease(GetRef(n)); + MixedModeVisitor::VisitExpr_(n); + } + + void VisitExpr_(const TupleGetItemNode* n) final { + CheckCounterAndIncrease(GetRef(n)); + MixedModeVisitor::VisitExpr_(n); + } + + void CheckCounterAndIncrease(const Expr& expr) { + if (target_expr_id_ == counter_) { + target_op_ = expr; + } + ++counter_; + } +}; + +IRModule ExtractIntermediateExprPacked(const IRModule& mod, const int expr_id) { + return ExtractIntermediateExprWrapper(mod, expr_id).Extract(); +} + +TVM_REGISTER_GLOBAL("relay.analysis.ExtractIntermediateExpr") + .set_body_typed(ExtractIntermediateExprPacked); + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_analysis_extract_intermediate_expr.py b/tests/python/relay/test_analysis_extract_intermediate_expr.py new file mode 100644 index 000000000000..abcaf880b4aa --- /dev/null +++ b/tests/python/relay/test_analysis_extract_intermediate_expr.py @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test function extraction""" +import pytest +import tvm +from tvm import relay + + +def get_conv_net(): + """This gets the net for: + conv2d + / | + / | + conv2d | + \ | + \ | + elemwise add + | + | + | + split + | + | + | + elemwise add + """ + dshape = (1, 1, 5, 1) + x = relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1) + + z = relay.add(y, x1) + + tuple_out = relay.op.split(z, indices_or_sections=1, axis=0) + + tuple_0_add = relay.add(tuple_out[0], relay.const(1, dtype="float32")) + + return tvm.IRModule.from_expr(tuple_0_add) + + +def get_conv2d(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + return tvm.IRModule.from_expr(y) + + +def test_extract(): + dshape = (1, 1, 5, 1) + + def before(): + return get_conv_net() + + def expected_0(): + x = relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + return tvm.IRModule.from_expr(y) + + def expected_1(): + x = 
relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1) + return tvm.IRModule.from_expr(x1) + + def expected_2(): + x = relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1) + z = relay.add(y, x1) + return tvm.IRModule.from_expr(z) + + def expected_3(): + x = relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1) + z = relay.add(y, x1) + tuple_out = relay.op.split(z, indices_or_sections=1, axis=0) + return tvm.IRModule.from_expr(tuple_out.astuple()) + + def expected_4(): + # check tuple node + x = relay.var("x", shape=dshape) + y = relay.nn.conv2d(x, relay.var("w1"), kernel_size=(3, 3), padding=(1, 1), channels=1) + x1 = relay.nn.conv2d(y, relay.var("w2"), kernel_size=(3, 3), padding=(1, 1), channels=1) + z = relay.add(y, x1) + tuple_out = relay.op.split(z, indices_or_sections=1, axis=0) + return tvm.IRModule.from_expr(tuple_out[0]) + + assert tvm.ir.structural_equal( + relay.analysis.extract_intermdeiate_expr(before(), 0), expected_0() + ) + assert tvm.ir.structural_equal( + relay.analysis.extract_intermdeiate_expr(before(), 1), expected_1() + ) + assert tvm.ir.structural_equal( + relay.analysis.extract_intermdeiate_expr(before(), 2), expected_2() + ) + assert tvm.ir.structural_equal( + (relay.analysis.extract_intermdeiate_expr(before(), 3)), expected_3() + ) + assert tvm.ir.structural_equal( + relay.analysis.extract_intermdeiate_expr(before(), 4), expected_4() + ) + assert tvm.ir.structural_equal(relay.analysis.extract_intermdeiate_expr(before(), 5), before()) + + +if __name__ == "__main__": + 
pytest.main([__file__]) From 038f15b5e204120709186a8791e5b49986060bb0 Mon Sep 17 00:00:00 2001 From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com> Date: Thu, 1 Sep 2022 16:31:54 +0300 Subject: [PATCH 087/704] [Hexagon] Implement fixed_point_multiply op through intrinsics. (#12659) This commit adds a high-performance implementation of fixed_point_multiply operation based on Hexagon intrinsics for vmpye/vmpyo instructions. Benchmarking of 'fixed_point_multiply' op with (1,8,56,56,32) input tensor on Qualcomm SM8350: * default implementation: 10.06 ms * optimized implementation: 1.42 ms * speedup: 7x times (!!!) Please note that this is introducing a small round-up error for some corner cases with negative shift argument (The same as for ARM CPU, see PR#5980). This is because we are rounding twice instead of only once: * original q_multiply_shift: round(x*y*2^-s) * hexagon q_multiply_shift: round(round(x*y)*2^-s) --- python/tvm/topi/hexagon/__init__.py | 1 + python/tvm/topi/hexagon/injective.py | 7 +- python/tvm/topi/hexagon/tensor_intrin.py | 71 +++++++++ .../test_hexagon/test_fixed_point_multiply.py | 140 ++++++++++++++++++ 4 files changed, 216 insertions(+), 3 deletions(-) create mode 100644 python/tvm/topi/hexagon/tensor_intrin.py create mode 100644 tests/python/contrib/test_hexagon/test_fixed_point_multiply.py diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py index dfe739288187..a3768a6e809e 100644 --- a/python/tvm/topi/hexagon/__init__.py +++ b/python/tvm/topi/hexagon/__init__.py @@ -26,4 +26,5 @@ from .pooling import * from .reduce import * from .resize2d import * +from .tensor_intrin import * from .qnn import * diff --git a/python/tvm/topi/hexagon/injective.py b/python/tvm/topi/hexagon/injective.py index 9ced0ac7d399..b1d1e1541961 100644 --- a/python/tvm/topi/hexagon/injective.py +++ b/python/tvm/topi/hexagon/injective.py @@ -19,6 +19,8 @@ import tvm +import numpy as np + def schedule_injective(outs):
"""Schedule for injective op. @@ -37,11 +39,10 @@ def schedule_injective(outs): outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs s = tvm.te.create_schedule([x.op for x in outs]) tvm.te.schedule.AutoInlineInjective(s) - - # Fuse axes and vectorize inner 128 elements + # Fuse axes and vectorize inner elements for x in outs: fused = s[x].fuse(*x.op.axis) - _, inner = s[x].split(fused, factor=128) + _, inner = s[x].split(fused, factor=128 // np.dtype(x.dtype).itemsize) s[x].vectorize(inner) return s diff --git a/python/tvm/topi/hexagon/tensor_intrin.py b/python/tvm/topi/hexagon/tensor_intrin.py new file mode 100644 index 000000000000..bdc63854328b --- /dev/null +++ b/python/tvm/topi/hexagon/tensor_intrin.py @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimized implementation of q_multiply_shift based on LLVM intrinsics""" + +import tvm +from tvm.ir import register_intrin_lowering + + +def _q_multiply_shift_hexagon(op): + """ + Implementation of q_multiply_shift through hexagon intrinsics vmpyewuh and vmpyowh when q == 31. + + Please note that this is introducing a small round-up error for some corner cases with negative + shift argument. 
This is because we are rounding twice instead of only once. I.e.: + + * original q_multiply_shift: round(x*y*2^-s) + * hexagon q_multiply_shift: round(round(x*y)*2^-s) + """ + x = op.args[0] + y = op.args[1] + fractional_bits = op.args[2] + shift = op.args[3] + + # Don't use this intrinsic if we don't have an int32x32 vector + # or if we are not multiplying q31 numbers + if x.dtype != "int32x32" or fractional_bits.value != 31: + return op + + # Case 1, shift is negative + mul_e_1 = tvm.tir.call_llvm_intrin( + op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y + ) + mul_o_1 = tvm.tir.call_llvm_intrin( + op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_1, x, y + ) + fixup = mul_o_1 & (-shift) + round_mul = mul_o_1 + fixup + out_negative_shift = tvm.tir.call_llvm_intrin( + op.dtype, "llvm.hexagon.V6.vaslwv.128B", tvm.tir.const(2, "uint32"), round_mul, shift + ) + + # Case 2, shift is positive + x = x * (1 << (shift)) + mul_e_2 = tvm.tir.call_llvm_intrin( + op.dtype, "llvm.hexagon.V6.vmpyewuh.128B", tvm.tir.const(2, "uint32"), x, y + ) + mul_o_2 = tvm.tir.call_llvm_intrin( + op.dtype, "llvm.hexagon.V6.vmpyowh.rnd.sacc.128B", tvm.tir.const(3, "uint32"), mul_e_2, x, y + ) + + # Select depending on the shift + return tvm.tir.Select(shift < 0, out_negative_shift, mul_o_2) + + +register_intrin_lowering( + "tir.q_multiply_shift", target="hexagon", f=_q_multiply_shift_hexagon, level=99 +) diff --git a/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py new file mode 100644 index 000000000000..8ee04a649990 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_fixed_point_multiply.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm.testing +from tvm import relay +from tvm.relay.backend import Executor +from tvm.contrib.hexagon.session import Session + +import re +import numpy as np + + +@tvm.testing.requires_hexagon +def test_vmpy_intrinsic_presence(): + """ + check intrinsic lowering for fixed_point_multiply operation + """ + ishape = (1, 128) + a = relay.var("a", relay.TensorType(ishape, "int32")) + + y = relay.fixed_point_multiply(a, 1395864320, 1) # 1.3 + + relay_mod = tvm.IRModule.from_expr(y) + + params = {} + target_hexagon = tvm.target.hexagon("v68") + executor = Executor("graph", {"link-params": True}) + + with tvm.transform.PassContext(opt_level=3): + hexagon_lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target_hexagon, host=target_hexagon), + executor=executor, + params=params, + ) + + asm = hexagon_lowered.lib.get_source("asm") + + # Check that 'vmpye' instruction was generated in asm file. + vmpye_regex = re.compile(r"v\d{1,2}.w = vmpye\(v\d{1,2}.w,v\d{1,2}.uh\)") + assert vmpye_regex.search(asm) is not None + + # Check that 'vmpyo' instruction was generated in asm file. 
+ vmpyo_regex = re.compile(r"v\d{1,2}.w \+= vmpyo\(v\d{1,2}.w,v\d{1,2}.h\):<<1:rnd:sat:shift") + assert vmpyo_regex.search(asm) is not None + + +def build_module(relay_mod, target): + params = {} + executor = Executor("graph", {"link-params": True}) + lowered = tvm.relay.build( + relay_mod, + tvm.target.Target(target, host=target), + executor=executor, + params=params, + ) + return lowered + + +def run_module(graph_mod, inputs): + graph_mod.set_input(**inputs) + graph_mod.run() + output = graph_mod.get_output(0).numpy() + return output + + +@tvm.testing.requires_hexagon +def test_fixed_point_multiply_positive_shift(hexagon_session: Session): + ishape = (6, 32) + a = relay.var("a", relay.TensorType(ishape, "int32")) + multiplier, shift = (1395864320, 1) # 1.3 + fpm = relay.fixed_point_multiply(a, multiplier, shift) + relay_mod = tvm.IRModule.from_expr(fpm) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... + hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68")) + + # Compile for LLVM... + llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm")) + + data_in = np.arange(-96, 96).reshape(ishape) + inputs = {"a": data_in} + + # Run hexagon... + graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(graph_mod, inputs) + + # Run llvm... + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_graph_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output) + + +@tvm.testing.requires_hexagon +def test_fixed_point_multiply_negative_shift(hexagon_session: Session): + ishape = (6, 32) + a = relay.var("a", relay.TensorType(ishape, "int32")) + multiplier, shift = (1288490240, -2) # 0.15 + fpm = relay.fixed_point_multiply(a, multiplier, shift) + relay_mod = tvm.IRModule.from_expr(fpm) + + with tvm.transform.PassContext(opt_level=3): + # Compile for Hexagon... 
+ hexagon_lowered = build_module(relay_mod, tvm.target.hexagon("v68")) + + # Compile for LLVM... + llvm_lowered = build_module(relay_mod, tvm.target.Target("llvm")) + + data_in = np.arange(-96, 96).reshape(ishape) + inputs = {"a": data_in} + + # Run hexagon... + graph_mod = hexagon_session.get_executor_from_factory(hexagon_lowered) + hexagon_output = run_module(graph_mod, inputs) + + # Run llvm... + llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0))) + expected_output = run_module(llvm_graph_mod, inputs) + + tvm.testing.assert_allclose(hexagon_output, expected_output, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() From 32f9a5f4d4f03a0875d64ac42df46cafe8ae3cfa Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Thu, 1 Sep 2022 04:23:35 -1000 Subject: [PATCH 088/704] [MetaSchedule] Fix autoinline for single const consumer block (#12668) fix autoinline and add test --- .../schedule_rule/auto_inline.cc | 5 +++- ...meta_schedule_schedule_rule_auto_inline.py | 28 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc index 76313f46d1c8..446c8ead7e8e 100644 --- a/src/meta_schedule/schedule_rule/auto_inline.cc +++ b/src/meta_schedule/schedule_rule/auto_inline.cc @@ -104,7 +104,10 @@ inline InlineType AutoInlineNode::CheckInline(const tir::Schedule& sch, } // Cond 2. For a block that generates a constant tensor, ignore all other conditions if (inline_const_tensor && block->reads.empty()) { - return InlineType::kInlineIntoConsumer; + Array consumer_srefs = GetConsumers(state, block_sref); + if (!consumer_srefs.empty() && CanComputeInline(state, block_sref)) { + return InlineType::kInlineIntoConsumer; + } } // Cond 3. 
The block doesn't contain any disallowed operators if (!is_pure_sptial && !disallow_op.empty() && HasOp(realize, disallow_op)) { diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py index a8ffa6ff9d3f..fcf6a8571b7f 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py @@ -320,6 +320,21 @@ def main(placeholder: T.Buffer[(1, 384), "int64"], placeholder_1: T.Buffer[(3052 T.writes(T_add[ax0, ax1, ax2]) T_add[ax0, ax1, ax2] = placeholder_1[T.min(T.max(T.int64(0), T.Select(T.cast(placeholder[ax0, ax1] < T.int64(0), "int32") != 0, placeholder[ax0, ax1] + T.int64(30522), placeholder[ax0, ax1])), T.int64(30521)), ax2] + placeholder_2[ax0, ax1, ax2] +@tvm.script.ir_module +class ConstConsumer: + @T.prim_func + def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1, i2 in T.grid(1, 12, 4096): + with T.block("T_full"): + ax0, ax1, ax2 = T.axis.remap("SSS", [i0, i1, i2]) + T.reads() + T.writes(T_full[ax0, ax1, ax2]) + T_full[ax0, ax1, ax2] = T.int64(0) + # pylint: enable=no-member,invalid-name,unused-variable,no-self-argument,line-too-long,chained-comparison,not-callable,too-many-nested-blocks # fmt: on @@ -383,8 +398,21 @@ def test_inline_pure_spatial(): tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial) +def test_inline_constant_tensor(): + mod = ConstConsumer + target = Target("cuda", host="llvm") + ctx = _create_context( + mod=mod, + target=target, + rule=auto_inline(target=target), + ) + (space,) = ctx.space_generator.generate_design_space(mod=mod) + tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer) + + if __name__ == "__main__": test_inline_consumer_chain() test_inline_into_cache() 
test_inline_into_multiple_consumers() test_inline_pure_spatial() + test_inline_constant_tensor() From effcd2251b4bb04e47f8ec288b056b0756ea4f4f Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Thu, 1 Sep 2022 08:57:40 -0700 Subject: [PATCH 089/704] Add methods to get and set late-bound constants. (#12664) * Add methods to read and restore late-bound constants on Executable. * Add bindings for new functions * Cleanup * Fix function name * Add tests for python API to access new load/save functions * Add another test for python API to access new load/save functions where there are no constants --- include/tvm/runtime/vm/executable.h | 13 +++++ python/tvm/runtime/vm.py | 10 ++++ src/runtime/vm/executable.cc | 24 ++++++++- tests/python/relay/test_vm.py | 80 +++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 1 deletion(-) diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 2405b3c0ba8c..fdbc1769c353 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -126,6 +126,11 @@ class TVM_DLL Executable : public ModuleNode { */ void MoveLateBoundConstantsToFile(const std::string& path, size_t byte_limit); + /*! + * \brief Get a map of all constants of byte size greater than or equal to byte_limit. + */ + Map GetLateBoundConstants(size_t byte_limit); + /*! * \brief Restores the late-bound constants for the executable (if any) from given byte-stream. * @@ -134,6 +139,14 @@ class TVM_DLL Executable : public ModuleNode { */ void LoadLateBoundConstantsFromStream(dmlc::Stream* stream); + /*! + * \brief Restores the late-bound constants for the executable (if any) from given map. + * + * Must be called after \p Load but before any other methods if \p MoveLateBoundConstantsToBinary + * was used when saving. Otherwise can be ignored. + */ + void LoadLateBoundConstantsFromMap(Map map); + /*! * \brief As for \p LoadLateBoundConstantsFromStream, but load from file at \p path.
*/ diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index c065d77a7c9f..615f66fdcc1c 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -86,7 +86,9 @@ def __init__(self, mod): self._get_function_arity = self.mod["get_function_arity"] self._get_function_param_name = self.mod["get_function_param_name"] self._move_late_bound_consts = self.mod["move_late_bound_consts"] + self._get_late_bound_consts = self.mod["get_late_bound_consts"] self._load_late_bound_consts = self.mod["load_late_bound_consts"] + self._load_late_bound_consts_from_map = self.mod["load_late_bound_consts_from_map"] def save(self): """Save the Relay VM Executable. @@ -312,10 +314,18 @@ def move_late_bound_consts(self, path, byte_limit): """Move all constants of byte size greater or equal to byte_limit to file at path""" return self._move_late_bound_consts(path, byte_limit) + def get_late_bound_consts(self, byte_limit): + """Return all constants of byte size greater or equal to byte_limit""" + return self._get_late_bound_consts(byte_limit) + def load_late_bound_consts(self, path): """Re-load constants previously saved to file at path""" return self._load_late_bound_consts(path) + def load_late_bound_consts_from_map(self, map): + """Re-load constants supplied in map""" + return self._load_late_bound_consts_from_map(map) + class VirtualMachine(object): """Relay VM runtime. 
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 85dad2839a8a..2484ece3081d 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -97,12 +97,25 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtr(byte_limit)); }); + } else if (name == "get_late_bound_consts") { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.size(), 1); + uint64_t byte_limit = args[0]; + Map consts = GetLateBoundConstants(static_cast(byte_limit)); + *rv = consts; + }); } else if (name == "load_late_bound_consts") { return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { CHECK_EQ(args.size(), 1); std::string path = args[0]; LoadLateBoundConstantsFromFile(path); }); + } else if (name == "load_late_bound_consts_from_map") { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { + CHECK_EQ(args.size(), 1); + Map map = args[0]; + LoadLateBoundConstantsFromMap(map); + }); } else { LOG(FATAL) << "Unknown packed function: " << name; return PackedFunc(); @@ -300,7 +313,7 @@ void Executable::SaveVirtualDevicesSection(dmlc::Stream* strm) { strm->Write(host_device_index); } -void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byte_limit) { +Map Executable::GetLateBoundConstants(size_t byte_limit) { ICHECK(late_bound_constant_names.empty()); late_bound_constant_names.reserve(constants.size()); Map map; @@ -323,6 +336,11 @@ void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byt } VLOG(1) << "moved " << map.size() << " constants of " << total_late_bound_bytes << " bytes (out of " << constants.size() << " overall) to be late-bound"; + return map; +} + +void Executable::MoveLateBoundConstantsToStream(dmlc::Stream* stream, size_t byte_limit) { + Map map = GetLateBoundConstants(byte_limit); runtime::SaveParams(stream, map); } @@ -341,6 +359,10 @@ void Executable::LoadLateBoundConstantsFromStream(dmlc::Stream* stream) { 
ICHECK_EQ(late_bound_constant_names.size(), constants.size()); Map map = runtime::LoadParams(stream); VLOG(1) << "loaded " << map.size() << " late-bound constants"; + LoadLateBoundConstantsFromMap(map); +} + +void Executable::LoadLateBoundConstantsFromMap(Map map) { for (size_t const_index = 0; const_index < constants.size(); ++const_index) { if (!late_bound_constant_names[const_index].defined()) { ICHECK(constants[const_index].defined()) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 4f649ad9beba..0b62db85c904 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -1405,5 +1405,85 @@ def test_vm_save_and_load_without_designating_late_bound_consts(): tvm.testing.assert_allclose(expected, actual.numpy()) +def test_load_and_save_constants_via_map(): + """Large constants can be serialized outside of executable""" + target = tvm.target.Target("llvm") + dev = tvm.cpu() + + # fn(x) { add(x, ) } + x = relay.var("x", shape=(1000, 1000)) + const_data = np.random.rand(1000, 1000).astype("float32") + const = relay.const(const_data, dtype="float32") + func = relay.Function([x], relay.op.add(x, const)) + mod = tvm.IRModule.from_expr(func) + + # Compile to executable. + vm_exec = vm.compile(mod, target=target) + + consts_map = vm_exec.get_late_bound_consts(byte_limit=256) + + # Save to constants and library files + temp = utils.tempdir() + path_dso = temp.relpath("lib.so") + vm_exec.mod.export_library(path_dso) + + # Load library files and constants + mod = runtime.load_module(path_dso) + mod["load_late_bound_consts_from_map"](consts_map) + + # Test main + x_data = np.random.rand(1000, 1000).astype("float32") + the_vm = runtime.vm.VirtualMachine(mod, dev) + actual = the_vm.invoke("main", x_data) + expected = x_data + const_data + tvm.testing.assert_allclose(expected, actual.numpy()) + + # We load the mod again so it's missing the consts. 
+ mod = runtime.load_module(path_dso) + exe = runtime.vm.Executable(mod) + + # Also test loading consts via the VM's wrapper API. + exe.load_late_bound_consts_from_map(consts_map) + + # Test main again with consts now loaded via the above API. + x_data = np.random.rand(1000, 1000).astype("float32") + the_vm = runtime.vm.VirtualMachine(exe, dev) + actual = the_vm.invoke("main", x_data) + expected = x_data + const_data + tvm.testing.assert_allclose(expected, actual.numpy()) + + +def test_load_late_bound_consts_via_map_with_no_late_bound_consts(): + """Check that load_late_bound_consts handles a model with no late bound consts.""" + target = tvm.target.Target("llvm") + dev = tvm.cpu() + + const_data = np.random.rand(1).astype("float64") + x = relay.var("x", shape=(1,), dtype="float64") + const = relay.const(const_data, dtype="float64") + + func = relay.Function([x], relay.op.add(x, const)) + mod = tvm.IRModule.from_expr(func) + + vm_exec = vm.compile(mod, target=target) + + temp = utils.tempdir() + path_dso = temp.relpath("lib.so") + + # Ensure const_data is below the byte threshold for a late-bound const. 
+ byte_limit = len(const_data.tobytes()) + 1 + consts_map = vm_exec.get_late_bound_consts(byte_limit=byte_limit) + vm_exec.mod.export_library(path_dso) + + mod = runtime.load_module(path_dso) + mod["load_late_bound_consts_from_map"](consts_map) + + x_data = np.random.rand(1).astype("float64") + loaded_vm = runtime.vm.VirtualMachine(mod, dev) + actual = loaded_vm.invoke("main", x_data) + expected = x_data + const_data + tvm.testing.assert_allclose(expected, actual.numpy()) + + if __name__ == "__main__": tvm.testing.main() From e814f798edc5bf6977a4f4f74ec8d1d7e363c608 Mon Sep 17 00:00:00 2001 From: Andrey Malyshev Date: Thu, 1 Sep 2022 19:33:15 +0300 Subject: [PATCH 090/704] [Adreno] Change compute/schedule for ToMixedPrecision pass (#12537) * [Adreno] Change compute/schedule for ToMixedPrecision pass * Address CI fails * address PR comments * Fix AutoTVM flow --- python/tvm/relay/op/strategy/adreno.py | 142 ++++++------------ python/tvm/topi/adreno/conv2d_alter_op.py | 48 +++--- python/tvm/topi/adreno/conv2d_nchw.py | 117 +++++++-------- .../tvm/topi/adreno/conv2d_nchw_winograd.py | 45 +----- python/tvm/topi/adreno/conv2d_nhwc.py | 111 +++++++------- .../tvm/topi/adreno/conv2d_nhwc_winograd.py | 45 +----- .../tvm/topi/adreno/conv2d_winograd_common.py | 19 +-- .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 42 +----- .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 38 +---- .../python/relay/test_conv2d_nchw_texture.py | 4 +- .../python/relay/test_conv2d_nhwc_texture.py | 2 +- 11 files changed, 218 insertions(+), 395 deletions(-) diff --git a/python/tvm/relay/op/strategy/adreno.py b/python/tvm/relay/op/strategy/adreno.py index a537fa1e7b90..9429fd71e1d9 100644 --- a/python/tvm/relay/op/strategy/adreno.py +++ b/python/tvm/relay/op/strategy/adreno.py @@ -36,8 +36,10 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): raise ValueError("dilation should be positive value") if groups == 1: - if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( - data_layout 
== "NCHW4c" and kernel_layout == "OIHW4o" + if ( + (data_layout == "NCHW" and kernel_layout == "OIHW") + or (data_layout == "NCHW4c" and kernel_layout == "OIHW4o") + or (data_layout == "NCHW" and kernel_layout == "OIHW4o") ): if len(kernel.shape) == 4: _, _, kh, kw = get_const_tuple(kernel.shape) @@ -47,35 +49,24 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): (2 < kh < 8 and 2 < kw < 8 and kh == kw) and (stride_h == 1 and stride_w == 1) and (dilation_h == 1 and dilation_w == 1) + and not (data_layout == "NCHW" and kernel_layout == "OIHW4o") ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_acc32), - name="conv2d_nchw_winograd_acc32.image2d", - plevel=7, - ) - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchwc), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc), - name="conv2d_nchwc.image2d", - plevel=10, + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.image2d", + plevel=5, ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc_acc32), - name="conv2d_nchwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchwc), + name="conv2d_nchwc.image2d", + plevel=10, ) - elif (data_layout == "NHWC" and kernel_layout == "HWIO") or ( - data_layout == "NHWC4c" and kernel_layout == "HWIO4o" + elif ( + (data_layout == "NHWC" and kernel_layout == "HWIO") + or (data_layout == "NHWC4c" and 
kernel_layout == "HWIO4o") + or (data_layout == "NHWC" and kernel_layout == "HWIO4o") ): if len(kernel.shape) == 4: kh, kw, _, _ = get_const_tuple(kernel.shape) @@ -85,32 +76,19 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): (2 < kh < 8 and 2 < kw < 8 and kh == kw) and (stride_h == 1 and stride_w == 1) and (dilation_h == 1 and dilation_w == 1) + and not (data_layout == "NHWC" and kernel_layout == "HWIO4o") ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd), - name="conv2d_nhwc_winograd.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_acc32), - name="conv2d_nhwc_winograd_acc32.image2d", - plevel=7, - ) - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc), - name="conv2d_nhwc.image2d", - plevel=10, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd), + name="conv2d_nhwc_winograd.image2d", + plevel=5, ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_acc32), - name="conv2d_nhwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc), + name="conv2d_nhwc.image2d", + plevel=10, ) else: raise RuntimeError( @@ -149,35 +127,21 @@ def conv2d_strategy_adreno(attrs, inputs, out_type, target): if (data_layout == "NCHW" and kernel_layout == "OIHW") or ( data_layout == "NCHW4c" and kernel_layout == "OIHW4o" ): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc), - 
wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc), - name="depthwise_conv2d_nchwc.image2d", - plevel=10, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc_acc32), - name="depthwise_conv2d_nchwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nchwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nchwc), + name="depthwise_conv2d_nchwc.image2d", + plevel=10, ) elif (data_layout == "NHWC" and kernel_layout == "HWOI") or ( data_layout == "NHWC4c" and kernel_layout == "HWOI4o" ): if data.shape[-1] >= 4: - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc), - name="depthwise_conv2d_nhwc.image2d", - plevel=10, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc_acc32), - wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc_acc32), - name="depthwise_conv2d_nhwc_acc32.image2d", - plevel=20, + wrap_compute_conv2d(topi.adreno.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.adreno.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.image2d", + plevel=10, ) else: strategy.add_implementation( @@ -208,40 +172,18 @@ def conv2d_winograd_without_weight_transfrom_strategy_adreno(attrs, inputs, out_ assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout in ("NCHW", "NCHW4c"): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform - ), - name="conv2d_nchw_winograd_without_weight_transform.image2d", - plevel=5, - ) strategy.add_implementation( - 
wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform_acc32), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform_acc32 - ), - name="conv2d_nchw_winograd_without_weight_transform_acc32.image2d", - plevel=7, + wrap_compute_conv2d(topi.adreno.conv2d_nchw_winograd_without_weight_transform), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nchw_winograd_without_weight_transform), + name="conv2d_nchw_winograd_without_weight_transform.image2d", + plevel=5, ) elif layout in ("NHWC", "NHWC4c"): - if out_type.dtype == "float16": - strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform - ), - name="conv2d_nhwc_winograd_without_weight_transform.image2d", - plevel=5, - ) strategy.add_implementation( - wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform_acc32), - wrap_topi_schedule( - topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform_acc32 - ), - name="conv2d_nhwc_winograd_without_weight_transform_acc32.image2d", - plevel=7, + wrap_compute_conv2d(topi.adreno.conv2d_nhwc_winograd_without_weight_transform), + wrap_topi_schedule(topi.adreno.schedule_conv2d_nhwc_winograd_without_weight_transform), + name="conv2d_nhwc_winograd_without_weight_transform.image2d", + plevel=5, ) else: raise RuntimeError( diff --git a/python/tvm/topi/adreno/conv2d_alter_op.py b/python/tvm/topi/adreno/conv2d_alter_op.py index 16573991e09c..6cf749a62b27 100644 --- a/python/tvm/topi/adreno/conv2d_alter_op.py +++ b/python/tvm/topi/adreno/conv2d_alter_op.py @@ -304,7 +304,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): num_filter_block = 4 # no support yet for tensors that cannot be divisible by factor 4 - if in_channel_block != 4 or num_filter_block != 4: + if num_filter_block != 4: return None batch_size, in_channel, height, width = 
get_const_tuple(data_tensor.shape) @@ -312,16 +312,22 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): # update new attrs new_attrs["channels"] = out_channel - new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + if in_channel_block == 4: + new_attrs["data_layout"] = "NCHW%dc" % in_channel_block + else: + new_attrs["data_layout"] = "NCHW" # (oc, ic, h, w) -> (OC, ic, h, w, oc) new_attrs["kernel_layout"] = "OIHW%do" % num_filter_block new_attrs["out_layout"] = "NCHW%dc" % num_filter_block # Store altered operator's config for applying of tuned AutoTVM statistics - new_data = te.placeholder( - (batch_size, in_channel // in_channel_block, height, width, in_channel_block), - dtype=data_dtype, - ) + if in_channel_block == 4: + new_data = te.placeholder( + (batch_size, in_channel // in_channel_block, height, width, in_channel_block), + dtype=data_dtype, + ) + else: + new_data = data_tensor new_kernel = te.placeholder( (out_channel // num_filter_block, in_filter_channel, kh, kw, num_filter_block), dtype=kernel_tensor.dtype, @@ -361,12 +367,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): num_filter_block = 4 # no support yet for tensors cannot be divisible by factor 4 - if in_channel_block != 4 or num_filter_block != 4: + if num_filter_block != 4: return None # update new attrs new_attrs["channels"] = out_channles - new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + if in_channel_block == 4: + new_attrs["data_layout"] = "NHWC%dc" % in_channel_block + else: + new_attrs["data_layout"] = "NHWC" # (h, w, ic, oc) -> (h, w, ic, OC, oc) if kernel_layout == "HWIO": new_attrs["kernel_layout"] = "HWIO%do" % num_filter_block @@ -375,16 +384,19 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs["out_layout"] = "NHWC%dc" % num_filter_block # Store altered operator's config for applying of tuned AutoTVM statistics - new_data = te.placeholder( - ( - batch_size, - in_height, - in_width, - in_channels // in_channel_block, - 
in_channel_block, - ), - dtype=data_dtype, - ) + if in_channel_block == 4: + new_data = te.placeholder( + ( + batch_size, + in_height, + in_width, + in_channels // in_channel_block, + in_channel_block, + ), + dtype=data_dtype, + ) + else: + new_data = data_tensor if kernel_layout == "HWIO": new_kernel = te.placeholder( ( diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index 65cd8e0150a8..082f71364af8 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -33,48 +33,22 @@ ) -@autotvm.register_topi_compute("conv2d_nchwc.image2d") -def conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("conv2d_nchwc_acc32.image2d") -def conv2d_nchwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("conv2d_nchwc.image2d") def schedule_conv2d_nchwc(cfg, outs): - return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nchwc_acc32.image2d") -def schedule_conv2d_nchwc_acc32(cfg, outs): - return schedule_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_conv2d_nchwc_impl(cfg, outs, tag): """Create the schedule for conv2d_nchw""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_conv2d_latest_op": schedule_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s 
-def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("conv2d_nchwc.image2d") +def conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Convolution operator in NCHWc layout. Algo: @@ -109,18 +83,12 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty convert_from4d = False if len(Input.shape) == 4: batch, in_channels, in_height, in_width = Input.shape - out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape - in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) - out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) if autotvm.GLOBAL_SCOPE.in_tuning: dshape = (batch, in_channel_chunks, in_height, in_width, in_channel_block) Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") - kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) - Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") else: - convert_from4d = True Input = pack_input( Input, "NCHW", @@ -131,6 +99,18 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty in_height, in_width, ) + else: + batch, in_channel_chunks, in_height, in_width, in_channel_block = Input.shape + + if len(Filter.shape) == 4: + out_channles, in_filter_channels, kernel_h, kernel_w = Filter.shape + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + + if autotvm.GLOBAL_SCOPE.in_tuning: + kshape = (out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True Filter = pack_filter( Filter, "OIHW", @@ -144,9 +124,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty kernel_h, kernel_w, ) - else: - batch, in_channel_chunks, 
in_height, in_width, in_channel_block = Input.shape out_channel_chunks, in_filter_channels, kernel_h, kernel_w, out_channel_block = Filter.shape out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( @@ -178,7 +156,7 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty ( temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb] * Filter[ffc, rcc * in_channel_block + rcb, ry, rx, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[rcc, rcb, ry, rx], ), tag="conv2d_nchwc", @@ -193,13 +171,13 @@ def compute_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dty return te.compute( (batch, out_channles, out_height_orig, out_width_orig), lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) else: return te.compute( (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) @@ -234,6 +212,20 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): conv = output.op.input_tensors[0] latest_blocked = latest + pad_data, kernel = s[conv].op.input_tensors + filter_pack_rt = bool( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ) + + if "pad_temp" in pad_data.op.name: + input_pad_temp = pad_data.op.input_tensors[0] + else: + input_pad_temp = pad_data + + input_pack_rt = bool( + isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag + ) + ##### space definition begin ##### n, fc, y, x, fb = s[conv].op.axis rcc, rcb, ry, rx = s[conv].op.reduce_axis @@ -274,37 +266,40 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): ##### space definition end ##### pad_data, kernel = s[conv].op.input_tensors - if ( - 
isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag - ): # len(latest.op.axis) == 4: - # manage scheduling of datacopy - pad_data, kernel = s[conv].op.input_tensors - if "pad_temp" in pad_data.op.name: - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + # There are several conditions that have to be handled: + # 1. If we are in the tuning, we always add cache read for data to main conv kernel + # to get texture in tuning opencl kernel + # 2. If we are repacking input in runtime, we should always explicit schedule this one more + # stage of data copy from 4d to 5d (referred as pack_data). + # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the + # cache_read("texture") + if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt: + if autotvm.GLOBAL_SCOPE.in_tuning: + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() else: - bind_data_copy(s[pad_data]) - bind_data_copy(s[kernel]) - - pad_data, kernel = s[conv].op.input_tensors + if "pad_temp" in pad_data.op.name: + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + s[pad_data].compute_inline() + else: + pack_data = pad_data + bind_data_copy(s[pack_data]) - if ( - autotvm.GLOBAL_SCOPE.in_tuning - or isinstance(kernel.op, tvm.te.ComputeOp) - and "filter_pack" in kernel.op.tag - ): - if "pad_temp" in pad_data.op.name: - s[pad_data].compute_inline() AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) elif "pad_temp" in pad_data.op.name: s[pad_data].compute_inline() # create cache stage AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) + if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: + if not autotvm.GLOBAL_SCOPE.in_tuning: + bind_data_copy(s[kernel]) + WT = s.cache_read(kernel, get_texture_storage(kernel.shape), 
[conv]) + bind_data_copy(s[WT]) + s[conv].set_scope("local") if latest_blocked == latest and output != latest: s[output].compute_inline() diff --git a/python/tvm/topi/adreno/conv2d_nchw_winograd.py b/python/tvm/topi/adreno/conv2d_nchw_winograd.py index 16f7cb8b19d9..0ddc0e7f2c0d 100644 --- a/python/tvm/topi/adreno/conv2d_nchw_winograd.py +++ b/python/tvm/topi/adreno/conv2d_nchw_winograd.py @@ -27,62 +27,32 @@ @autotvm.register_topi_compute("conv2d_nchw_winograd.image2d") def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float16"} return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False - ) - - -@autotvm.register_topi_compute("conv2d_nchw_winograd_acc32.image2d") -def conv2d_nchw_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False ) @autotvm.register_topi_schedule("conv2d_nchw_winograd.image2d") def schedule_conv2d_nchw_winograd(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nchw_winograd_acc32.image2d") -def schedule_conv2d_nchw_winograd_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at") @autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.image2d") def conv2d_nchw_winograd_without_weight_transform( cfg, data, kernel, strides, padding, dilation, out_dtype ): - args = {"shared": False, "accumulator": "float16"} return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True 
- ) - - -@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") -def conv2d_nchw_winograd_without_weight_transform_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype -): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True ) @autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.image2d") def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) - - -@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform_acc32.image2d") -def schedule_conv2d_nchw_winograd_without_weight_transform_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True) def conv2d_nchw_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed ): """Compute declaration for winograd @@ -111,9 +81,6 @@ def conv2d_nchw_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. 
accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -124,5 +91,5 @@ def conv2d_nchw_winograd_comp( 4-D or 5-D with shape NCHW or NCHW4c """ return conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NCHW" ) diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index b377169ca8c9..993b63252531 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -33,44 +33,22 @@ ) -@autotvm.register_topi_compute("conv2d_nhwc.image2d") -def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) - - -@autotvm.register_topi_compute("conv2d_nhwc_acc32.image2d") -def conv2d_nhwc_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_conv2d_NHWC_HWIO(data, kernel, strides, padding, dilation, out_dtype, args=args) - - @autotvm.register_topi_schedule("conv2d_nhwc.image2d") def schedule_conv2d_nhwc(cfg, outs): - return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nhwc_acc32.image2d") -def schedule_conv2d_nhwc_acc32(cfg, outs): - return schedule_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_conv2d_nhwc_impl(cfg, outs, tag): """Create the schedule for conv2d_nhwc""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_conv2d_latest_op": schedule_conv2d_NHWC(cfg, s, 
op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("conv2d_nhwc.image2d") +def conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Convolution operator in NHWC layout. Algo: @@ -105,18 +83,12 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype convert_from4d = False if len(Input.shape) == 4: batch, in_height, in_width, in_channels = Input.shape - kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape - in_channel_chunks, in_channel_block, in_channel_tail = split_to_chunks(in_channels, 4) - out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) if autotvm.GLOBAL_SCOPE.in_tuning: dshape = (batch, in_height, in_width, in_channel_chunks, in_channel_block) Input = tvm.te.placeholder(dshape, Input.dtype, name="data_placeholder") - kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block) - Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") else: - convert_from4d = True Input = pack_input( Input, "NHWC", @@ -127,6 +99,17 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype in_height, in_width, ) + else: + batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape + + if len(Filter.shape) == 4: + kernel_h, kernel_w, in_filter_channels, out_channles = Filter.shape + out_channel_chunks, out_channel_block, out_channel_tail = split_to_chunks(out_channles, 4) + if autotvm.GLOBAL_SCOPE.in_tuning: + kshape = (kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block) + Filter = tvm.te.placeholder(kshape, Filter.dtype, name="kernel_placeholder") + else: + convert_from4d = True Filter = pack_filter( Filter, "HWIO", @@ -140,9 +123,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype 
kernel_h, kernel_w, ) - else: - batch, in_height, in_width, in_channel_chunks, in_channel_block = Input.shape kernel_h, kernel_w, in_filter_channels, out_channel_chunks, out_channel_block = Filter.shape out_height_orig, out_height, out_width_orig, out_width = expand_spatial_dimensions( @@ -173,7 +154,7 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype ( temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcc, rcb] * Filter[ry, rx, rcc * in_channel_block + rcb, fc, fb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx, rcc, rcb], ), tag="conv2d_nhwc", @@ -188,13 +169,13 @@ def compute_conv2d_NHWC_HWIO(Input, Filter, stride, padding, dilation, out_dtype return te.compute( (batch, out_height_orig, out_width_orig, out_channles), lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) else: return te.compute( (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_conv2d_latest_op", ) @@ -229,6 +210,19 @@ def schedule_conv2d_NHWC(cfg, s, output): conv = output.op.input_tensors[0] latest_blocked = latest + pad_data, kernel = s[conv].op.input_tensors + filter_pack_rt = bool( + isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag + ) + + if "pad_temp" in pad_data.op.name: + input_pad_temp = pad_data.op.input_tensors[0] + else: + input_pad_temp = pad_data + + input_pack_rt = bool( + isinstance(input_pad_temp.op, tvm.te.ComputeOp) and "input_pack" in input_pad_temp.op.tag + ) ##### space definition begin ##### n, y, x, fc, fb = s[conv].op.axis ry, rx, rcc, rcb = s[conv].op.reduce_axis @@ -270,37 +264,40 @@ def schedule_conv2d_NHWC(cfg, s, output): ##### space definition end ##### pad_data, kernel = 
s[conv].op.input_tensors - if ( - isinstance(kernel.op, tvm.te.ComputeOp) and "filter_pack" in kernel.op.tag - ): # len(latest.op.axis) == 4: - # manage scheduling of datacopy - pad_data, kernel = s[conv].op.input_tensors - if "pad_temp" in pad_data.op.name: - pack_data = pad_data.op.input_tensors[0] - bind_data_copy(s[pack_data]) + # There are several conditions that have to be handled: + # 1. If we are in the tuning, we always add cache read for data to main conv kernel + # to get texture in tuning opencl kernel + # 2. If we are repacking input in runtime, we should always explicit schedule this one more + # stage of data copy from 4d to 5d (referred as pack_data). + # 3. If we have pad (independently if we have runtime repack or not) we should inline it in the + # cache_read("texture") + if autotvm.GLOBAL_SCOPE.in_tuning or input_pack_rt: + if autotvm.GLOBAL_SCOPE.in_tuning: + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() else: - bind_data_copy(s[pad_data]) - bind_data_copy(s[kernel]) - - pad_data, kernel = s[conv].op.input_tensors + if "pad_temp" in pad_data.op.name: + s[pad_data].compute_inline() + pack_data = pad_data.op.input_tensors[0] + bind_data_copy(s[pack_data]) + else: + pack_data = pad_data + bind_data_copy(s[pack_data]) - if ( - autotvm.GLOBAL_SCOPE.in_tuning - or isinstance(kernel.op, tvm.te.ComputeOp) - and "filter_pack" in kernel.op.tag - ): - if "pad_temp" in pad_data.op.name: - s[pad_data].compute_inline() AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) - WT = s.cache_read(kernel, get_texture_storage(kernel.shape), [conv]) - bind_data_copy(s[WT]) elif "pad_temp" in pad_data.op.name: s[pad_data].compute_inline() # create cache stage AT = s.cache_read(pad_data, get_texture_storage(pad_data.shape), [conv]) bind_data_copy(s[AT]) + if autotvm.GLOBAL_SCOPE.in_tuning or filter_pack_rt: + if not autotvm.GLOBAL_SCOPE.in_tuning: + bind_data_copy(s[kernel]) + WT = s.cache_read(kernel, 
get_texture_storage(kernel.shape), [conv]) + bind_data_copy(s[WT]) + s[conv].set_scope("local") if latest_blocked == latest and output != latest: s[output].compute_inline() diff --git a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py index bfe385f210a4..b055b388e1a7 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc_winograd.py +++ b/python/tvm/topi/adreno/conv2d_nhwc_winograd.py @@ -27,62 +27,32 @@ @autotvm.register_topi_compute("conv2d_nhwc_winograd.image2d") def conv2d_nhwc_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float16"} return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False - ) - - -@autotvm.register_topi_compute("conv2d_nhwc_winograd_acc32.image2d") -def conv2d_nhwc_winograd_acc32(cfg, data, kernel, strides, padding, dilation, out_dtype): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=False + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=False ) @autotvm.register_topi_schedule("conv2d_nhwc_winograd.image2d") def schedule_conv2d_nhwc_winograd(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("conv2d_nhwc_winograd_acc32.image2d") -def schedule_conv2d_nhwc_winograd_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32") + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at") @autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform.image2d") def conv2d_nhwc_winograd_without_weight_transform( cfg, data, kernel, strides, padding, dilation, out_dtype ): - args = {"shared": False, "accumulator": "float16"} return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, 
out_dtype, args=args, pre_computed=True - ) - - -@autotvm.register_topi_compute("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") -def conv2d_nhwc_winograd_without_weight_transform_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype -): - args = {"shared": False, "accumulator": "float32"} - return conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args=args, pre_computed=True + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed=True ) @autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform.image2d") def schedule_conv2d_nhwc_winograd_without_weight_transform(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc16", pre_computed=True) - - -@autotvm.register_topi_schedule("conv2d_nhwc_winograd_without_weight_transform_acc32.image2d") -def schedule_conv2d_nhwc_winograd_without_weight_transform_acc32(cfg, outs): - return schedule_conv2d_winograd_impl(cfg, outs, tag="cast_from_acc32", pre_computed=True) + return schedule_conv2d_winograd_impl(cfg, outs, tag="dummy_compute_at", pre_computed=True) def conv2d_nhwc_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed ): """Compute declaration for winograd @@ -111,9 +81,6 @@ def conv2d_nhwc_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. 
accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -124,5 +91,5 @@ def conv2d_nhwc_winograd_comp( 4-D or 5-D with shape NCHW or NCHW4c """ return conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, "NHWC" + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, "NHWC" ) diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py index b0cec0f70280..501773ad46fa 100644 --- a/python/tvm/topi/adreno/conv2d_winograd_common.py +++ b/python/tvm/topi/adreno/conv2d_winograd_common.py @@ -35,7 +35,7 @@ def conv2d_winograd_comp( - cfg, data, kernel, strides, padding, dilation, out_dtype, args, pre_computed, layout + cfg, data, kernel, strides, padding, dilation, out_dtype, pre_computed, layout ): """Compute declaration for winograd @@ -64,9 +64,6 @@ def conv2d_winograd_comp( out_dtype: str The output type. This is used for mixed precision. - args: dict - Dictionary with additional arguments, e.g. 
accumulator type - pre_computed: bool Flag if weights were pre computed if true or the weights should be computed in runtime @@ -186,7 +183,7 @@ def conv2d_winograd_comp( r = KW m = tile_size - A, B, G = winograd_transform_matrices(m, r, out_dtype) + A, B, G = winograd_transform_matrices(m, r, data.dtype) H = (H + pt + pb - KH) // HSTR + 1 W = (W + pl + pr - KW) // WSTR + 1 @@ -268,7 +265,7 @@ def conv2d_winograd_comp( lambda eps, nu, co, p, cob: te.sum( ( kernel_pack[eps][nu][ci * CB + cb][co][cob] * data_pack_trans[eps][nu][ci][p][cb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ci, cb], ), name="bgemm", @@ -280,7 +277,7 @@ def conv2d_winograd_comp( inverse = te.compute( (CO, P, m, m, COB), lambda co, p, vh, vw, cob: te.sum( - bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(args["accumulator"]), + bgemm[r_a][r_b][co][p][cob] * (A[r_a][vh] * A[r_b][vw]).astype(out_dtype), axis=[r_a, r_b], ), name="inverse", @@ -295,7 +292,7 @@ def conv2d_winograd_comp( idxmod(h, m) ][idxmod(w, m)][c % CB].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: output = te.compute( @@ -304,7 +301,7 @@ def conv2d_winograd_comp( n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: if convert_from4d and autotvm.GLOBAL_SCOPE.in_tuning is False: @@ -314,7 +311,7 @@ def conv2d_winograd_comp( idxmod(h, m) ][idxmod(w, m)][c % CB].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) else: output = te.compute( @@ -323,7 +320,7 @@ def conv2d_winograd_comp( n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m) ][idxmod(h, m)][idxmod(w, m)][cob].astype(out_dtype), name="output", - tag="cast_from_acc" + args["accumulator"][-2:], + tag="dummy_compute_at", ) if isinstance(N, int): diff --git 
a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index 37713b4584b9..eb998bdbcd6e 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -33,50 +33,22 @@ ) -@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d") -def depthwise_conv2d_nchwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_depthwise_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("depthwise_conv2d_nchwc_acc32.image2d") -def depthwise_conv2d_nchwc_acc32( - cfg, data, kernel, strides, padding, dilation, out_dtype="float16" -): - """Compute depthwise_conv2d with NCHWc layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_depthwise_conv2d_NCHWc_KCRSk( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("depthwise_conv2d_nchwc.image2d") def schedule_depthwise_conv2d_nchwc(cfg, outs): - return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("depthwise_conv2d_nchwc_acc32.image2d") -def schedule_depthwise_conv2d_nchwc_acc32(cfg, outs): - return schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_depthwise_conv2d_nchwc_impl(cfg, outs, tag): """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_dw_conv2d_latest_op": schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilation, out_dtype, args): 
+@autotvm.register_topi_compute("depthwise_conv2d_nchwc.image2d") +def depthwise_conv2d_nchwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Depthwise convolution operator in NCHWc layout. Algo: @@ -183,10 +155,10 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio ffb, ] * Filter[ffc // in_filter_channels, ffc % in_filter_channels, ry, rx, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx], ), - tag="depthwise_conv2d_nchwc_kcrsk", + tag="depthwise_conv2d_nchwc", ) if convert_from4d and not autotvm.GLOBAL_SCOPE.in_tuning: @@ -198,13 +170,13 @@ def compute_depthwise_conv2d_NCHWc_KCRSk(Input, Filter, stride, padding, dilatio return te.compute( (batch, out_channles, out_height_orig, out_width_orig), lambda n, c, y, x: dummy_cast[n, c // out_channel_block, y, x, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) else: return te.compute( (batch, out_channel_chunks, out_height_orig, out_width_orig, out_channel_block), lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index 2b228b444fca..c27f2a9eae7c 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -33,48 +33,22 @@ ) -@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d") -def depthwise_conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NHWC layout""" - args = {"shared": False, "accumulator": "float16"} - return compute_depthwise_conv2d_NHWC_HWOI( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - -@autotvm.register_topi_compute("depthwise_conv2d_nhwc_acc32.image2d") -def depthwise_conv2d_nhwc_acc32(cfg, data, 
kernel, strides, padding, dilation, out_dtype="float16"): - """Compute depthwise_conv2d with NHWC layout""" - args = {"shared": False, "accumulator": "float32"} - return compute_depthwise_conv2d_NHWC_HWOI( - data, kernel, strides, padding, dilation, out_dtype, args=args - ) - - @autotvm.register_topi_schedule("depthwise_conv2d_nhwc.image2d") def schedule_depthwise_conv2d_nhwc(cfg, outs): - return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc16") - - -@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_acc32.image2d") -def schedule_depthwise_conv2d_nhwc_acc32(cfg, outs): - return schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag="cast_from_acc32") - - -def schedule_depthwise_conv2d_nhwc_impl(cfg, outs, tag): """Create the schedule for depthwise conv2d_nchw4c_ohwi4o""" outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = te.create_schedule([x.op for x in outs]) def _callback(op): - if op.tag == tag: + if op.tag == "adreno_dw_conv2d_latest_op": schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, out_dtype, args): +@autotvm.register_topi_compute("depthwise_conv2d_nhwc.image2d") +def depthwise_conv2d_nhwc(cfg, Input, Filter, stride, padding, dilation, out_dtype): """ Depthwise convolution operator in NCHWc layout. 
Algo: @@ -175,7 +149,7 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, ( temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ffc, ffb] * Filter[ry, rx, ffc, 0, ffb] - ).astype(args["accumulator"]), + ).astype(out_dtype), axis=[ry, rx], ), tag="depthwise_conv2d_nhwc", @@ -190,13 +164,13 @@ def compute_depthwise_conv2d_NHWC_HWOI(Input, Filter, stride, padding, dilation, return te.compute( (batch, out_height_orig, out_width_orig, out_channles), lambda n, y, x, c: dummy_cast[n, y, x, c // out_channel_block, c % out_channel_block], - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) else: return te.compute( (batch, out_height_orig, out_width_orig, out_channel_chunks, out_channel_block), lambda n, y, x, ffc, ffb: conv[n, y, x, ffc, ffb].astype(out_dtype), - tag="cast_from_acc" + args["accumulator"][-2:], + tag="adreno_dw_conv2d_latest_op", ) diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/test_conv2d_nchw_texture.py index 6eadd8fc1c7a..ab12e40b39cb 100644 --- a/tests/python/relay/test_conv2d_nchw_texture.py +++ b/tests/python/relay/test_conv2d_nchw_texture.py @@ -437,7 +437,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", 
[["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file @@ -486,7 +486,7 @@ def test_conv2d_winograd_conv(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd_acc32.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/test_conv2d_nhwc_texture.py index be5cefd46038..cf8116c076cc 100644 --- 
a/tests/python/relay/test_conv2d_nhwc_texture.py +++ b/tests/python/relay/test_conv2d_nhwc_texture.py @@ -598,7 +598,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd_acc32.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file From 54786bbff340426109de7785bb2de4c1dfc2a738 Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Thu, 1 Sep 2022 16:51:33 -0400 Subject: [PATCH 091/704] [hexagon][tests] re-enable maxpool hardware test (#12676) - Re-enable test_max_pool2d_slice.py when run on Hexagon hardware (as opposed to hexagon-sim). This is now safe because https://github.com/apache/tvm/issues/11928 has been fixed. 
--- .../python/contrib/test_hexagon/topi/test_max_pool2d_slice.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py index 373a59e0b613..f827f025af17 100644 --- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py @@ -330,9 +330,6 @@ def test_max_pool2d_slice( expected_output_np, hexagon_session: Session, ): - if hexagon_session._launcher._serial_number != "simulator": - pytest.skip(msg="Due to https://github.com/apache/tvm/issues/11928") - target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape_padded, name="A", dtype=dtype) From 50dad0d9a3c85f7692025b5330ceb902e264bb92 Mon Sep 17 00:00:00 2001 From: arangasa <76030063+arangasa@users.noreply.github.com> Date: Fri, 2 Sep 2022 02:49:40 +0530 Subject: [PATCH 092/704] [HEXAGON][TOPI]Slice Op Argmax uint8 (#12472) --- python/tvm/topi/hexagon/slice_ops/argmax.py | 7 +++++++ .../contrib/test_hexagon/topi/test_argmax_slice.py | 14 ++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/python/tvm/topi/hexagon/slice_ops/argmax.py b/python/tvm/topi/hexagon/slice_ops/argmax.py index 4d34cb50a0b0..a3a0ea37c37c 100644 --- a/python/tvm/topi/hexagon/slice_ops/argmax.py +++ b/python/tvm/topi/hexagon/slice_ops/argmax.py @@ -43,4 +43,11 @@ def argmax_schedule(argmax_func, in_layout_str, out_layout_str): argmax_func, fp16_layout_transform, int32_layout_transform ) return tir_s + if (in_layout_str == "nhwc-8h8w32c-2d") and (out_layout_str == "nhw-32h16w-2d"): + int8_layout_transform = get_layout_transform_fn(in_layout_str) + int32_layout_transform = get_layout_transform_fn(out_layout_str) + tir_s = argmax_stir_schedule_nhwc( + argmax_func, int8_layout_transform, int32_layout_transform + ) + return tir_s raise RuntimeError(f"Unexpected input_layout, output_layout '{in_layout_str, 
out_layout_str}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py index eaba9fafde3a..32d7a5097384 100644 --- a/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_argmax_slice.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """ Tests for Hexagon slice argmax op """ -import pytest import numpy as np import tvm @@ -33,15 +32,18 @@ class TestArgMaxSlice: input_shape, input_layout, output_layout, + dtype, in_axis, in_axis_sep, out_axis_sep, ) = tvm.testing.parameters( - ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), - ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), - ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", [3], [4], [3]), + ((1, 64, 64, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]), + ((3, 32, 16, 32), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]), + ((1, 32, 32, 64), "nhwc-8h2w32c2w-2d", "nhw-32h16w-2d", "float16", [3], [4], [3]), + ((1, 64, 64, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]), + ((3, 32, 16, 32), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]), + ((1, 32, 32, 64), "nhwc-8h8w32c-2d", "nhw-32h16w-2d", "int8", [3], [4], [3]), ) - dtype = tvm.testing.parameter("float16") working_scope = tvm.testing.parameter("global.vtcm") @tvm.testing.fixture @@ -96,7 +98,7 @@ def test_argmax_slice( axis_separators=out_axis_sep, mem_scope=working_scope, ) - with tvm.transform.PassContext(opt_level=3, config={"tir.disable_assert": True}): + with tvm.transform.PassContext(opt_level=3): tir_irm = tvm.lower(tir_s.mod, [argmax_input, output], name="argmax") runtime_module = tvm.build( tir_irm, [argmax_input, output], target=target, name="argmax" From eecb7fd494052ca941f3d123daa2e887f14b7e75 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 1 Sep 
2022 16:44:42 -0700 Subject: [PATCH 093/704] [MetaSchedule] Introduce `Union` and `OrderedUnion` in Database (#12628) Following up #12520 and #12626, this PR introduces two database classes: `UnionDatabase` and `OrderedUnionDatabase`, both of which allow users to organically compose multiple databases together, so that the high-level IR (Relay, Relax) could select the best tuning records according to running time or a preferred order given by users. For each query, `UnionDatabase` returns the best record among all the databases given; instead, `OrderedUnionDatabase` returns the record from the first database that responds to the query. Used together, users may specify complicated dispatching patterns like below: Examples below demonstrate the use cases of and difference between `UnionDatabase` and `OrderedUnionDatabase`. Assumption: * db1, db2 do not have tuning records for the target workload. * Each of db3, db4, db5 has tuning records r3, r4, r5 for target workload respectively. ```python #### Case 1. `UnionDatabase`: merged_db = ms.database.UnionDatabase( db1, # no record db2, # no record db3, # has r3 db4 # has r4 ) # returns the better one between r3 and r4 merged_db.query_tuning_record(..., target_workload) ### Case 2. `OrderedUnionDatabase` merged_db = ms.database.OrderedUnionDatabase( db1, # no record db2, # no record db3, # has r3 db4 # has r4 ) # returns r3 merged_db.query_tuning_record(..., target_workload) ### Case 3. Mix-use scenario merged_db = ms.database.UnionDatabase( db1, # no record db2, # no record db3, # has r3 ms.database.OrderedUnionDatabase( # returns r4 db4, # has r4 db5, # has r5 ) ) # returns the better one between r3 and r4 merged_db.query_tuning_record(..., target_workload) ### Case 4.
Another mix-use scenario merged_db = ms.database.UnionDatabase( db1, # no record db2, # no record db3, # has r3 ms.database.UnionDatabase( # returns the better one between r4 and r5 db4, # has r4 db5, # has r5 ) ) # returns the best one among r3, r4 and r5 merged_db.query_tuning_record(..., target_workload) ### Case 5. Yet another mix-use scenario merged_db = ms.database.OrderedUnionDatabase( db1, # no record db2, # no record ms.database.UnionDatabase( # returns the better one between r3 and r4 db3, # has r3 db4, # has r4 ), db5, # has r5 ) # returns the better one between r3 and r4 merged_db.query_tuning_record(..., target_workload) ``` Co-authored-by: sunggg <49998730+sunggg@users.noreply.github.com> --- include/tvm/meta_schedule/database.h | 16 +++ python/tvm/meta_schedule/database/__init__.py | 2 + .../database/ordered_union_database.py | 112 ++++++++++++++++++ .../meta_schedule/database/union_database.py | 112 ++++++++++++++++++ src/meta_schedule/database/json_database.cc | 22 ---- .../database/ordered_union_database.cc | 86 ++++++++++++++ src/meta_schedule/database/union_database.cc | 88 ++++++++++++++ src/meta_schedule/utils.h | 22 ++++ tests/python/unittest/test_link_params.py | 9 +- .../unittest/test_meta_schedule_database.py | 37 ++++++ 10 files changed, 477 insertions(+), 29 deletions(-) create mode 100644 python/tvm/meta_schedule/database/ordered_union_database.py create mode 100644 python/tvm/meta_schedule/database/union_database.py create mode 100644 src/meta_schedule/database/ordered_union_database.cc create mode 100644 src/meta_schedule/database/union_database.cc diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 88db2e227786..fa488a38ce0a 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -357,6 +357,22 @@ class Database : public runtime::ObjectRef { */ TVM_DLL static Database JSONDatabase(String path_workload, String path_tuning_record, bool allow_missing); +
/*! + * \brief A database composed of multiple databases, allowing users to guide IR rewriting using + * combined knowledge of those databases. To each query, it returns the best record among all the + * databases given. + * \param databases The list of databases to be combined. + * \return The combined database. + */ + TVM_DLL static Database UnionDatabase(Array<Database> databases); + /*! + * \brief A database composed of multiple databases, allowing users to guide IR rewriting using + * combined knowledge of those databases. To each query, it returns the record from the first + * database that responds to the query. + * \param databases The list of databases to be combined. + * \return The combined database. + */ + TVM_DLL static Database OrderedUnionDatabase(Array<Database> databases); /*! * \brief Create a database with customized methods on the python-side. * \param f_has_workload The packed function of `HasWorkload`. diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py index 7726daf6eb63..679923e47936 100644 --- a/python/tvm/meta_schedule/database/__init__.py +++ b/python/tvm/meta_schedule/database/__init__.py @@ -21,4 +21,6 @@ from .database import Database, PyDatabase, TuningRecord, Workload from .json_database import JSONDatabase from .memory_database import MemoryDatabase +from .ordered_union_database import OrderedUnionDatabase from .schedule_fn_database import ScheduleFnDatabase +from .union_database import UnionDatabase diff --git a/python/tvm/meta_schedule/database/ordered_union_database.py b/python/tvm/meta_schedule/database/ordered_union_database.py new file mode 100644 index 000000000000..35b0a9e282c1 --- /dev/null +++ b/python/tvm/meta_schedule/database/ordered_union_database.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A database consists of multiple databases.""" +from tvm._ffi import register_object + +from .. import _ffi_api +from .database import Database + + +@register_object("meta_schedule.OrderedUnionDatabase") +class OrderedUnionDatabase(Database): + """A database composed of multiple databases, allowing users to guide IR rewriting using + combined knowledge of those databases. To each query, it returns the record from the first + database that responds to the query. + + Examples + -------- + Examples below demonstrate the usecases of and difference between UnionDatabase and + OrderDatabase. + + Assumption: + * db1, db2 do not have tuning records for the target workload. + * Each of db3, db4, db5 has tuning records r3, r4, r5 for target workload respectively. + + .. code-block:: python + + #### Case 1. `UnionDatabase`: + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + db4 # has r4 + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + + ### Case 2. `OrderedUnionDatabase` + merged_db = ms.database.OrderedUnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + db4 # has r4 + ) + # returns r3 + merged_db.query_tuning_record(..., target_workload) + + ### Case 3. 
Mix-use scenario + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + ms.database.OrderedUnionDatabase( # returns r4 + db4, # has r4 + db5, # has r5 + ) + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + + ### Case 4. Another mix-use scenario + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + ms.database.UnionDatabase( # returns best one between r4 and r5 + db4, # has r4 + db5, # has r5 + ) + ) + # returns the best one among r3, r4 and r5 + merged_db.query_tuning_record(..., target_workload) + + ### Case 5. Yet another mix-use scenario + merged_db = ms.database.OrderedUnionDatabase( + db1, # no record + db2, # no record + ms.database.UnionDatabase( # returns best one between r3 and r4 + db3, # has r3 + db4, # has r4 + ), + db5, # has r5 + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + """ + + def __init__(self, *databases: Database) -> None: + """Construct a merged database from multiple databases. + + Parameters + ---------- + *databases : Database + The list of databases to combine. + """ + self.__init_handle_by_constructor__( + _ffi_api.DatabaseOrderedUnionDatabase, # type: ignore # pylint: disable=no-member + databases, + ) diff --git a/python/tvm/meta_schedule/database/union_database.py b/python/tvm/meta_schedule/database/union_database.py new file mode 100644 index 000000000000..ae55ebe79614 --- /dev/null +++ b/python/tvm/meta_schedule/database/union_database.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""A database consisting of multiple databases.""" +from tvm._ffi import register_object + +from .. import _ffi_api +from .database import Database + + +@register_object("meta_schedule.UnionDatabase") +class UnionDatabase(Database): + """A database composed of multiple databases, allowing users to guide IR rewriting using + combined knowledge of those databases. To each query, it returns the best record among all the + databases given. + + Examples + -------- + Examples below demonstrate the use cases of and difference between UnionDatabase and + OrderedUnionDatabase. + + Assumption: + * db1, db2 do not have tuning records for the target workload. + * Each of db3, db4, db5 has tuning records r3, r4, r5 for target workload respectively. + + .. code-block:: python + + ### Case 1. `UnionDatabase`: + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + db4 # has r4 + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + + ### Case 2. `OrderedUnionDatabase` + merged_db = ms.database.OrderedUnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + db4 # has r4 + ) + # returns r3 + merged_db.query_tuning_record(..., target_workload) + + ### Case 3. Mix-use scenario + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + ms.database.OrderedUnionDatabase( # returns r4 + db4, # has r4 + db5, # has r5 + ) + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + + ### Case 4. 
Another mix-use scenario + merged_db = ms.database.UnionDatabase( + db1, # no record + db2, # no record + db3, # has r3 + ms.database.UnionDatabase( # returns best one between r4 and r5 + db4, # has r4 + db5, # has r5 + ) + ) + # returns the best one among r3, r4 and r5 + merged_db.query_tuning_record(..., target_workload) + + ### Case 5. Yet another mix-use scenario + merged_db = ms.database.OrderedUnionDatabase( + db1, # no record + db2, # no record + ms.database.UnionDatabase( # returns best one between r3 and r4 + db3, # has r3 + db4, # has r4 + ), + db5, # has r5 + ) + # returns the better one between r3 and r4 + merged_db.query_tuning_record(..., target_workload) + """ + + def __init__(self, *databases: Database) -> None: + """Construct a merged database from multiple databases. + + Parameters + ---------- + *databases : Database + The list of databases to combine. + """ + self.__init_handle_by_constructor__( + _ffi_api.DatabaseUnionDatabase, # type: ignore # pylint: disable=no-member + databases, + ) diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 2e4f85260835..91b96c82479f 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -25,28 +25,6 @@ namespace tvm { namespace meta_schedule { -/*! \brief The struct defining comparison function of sorting by mean run seconds. */ -struct SortTuningRecordByMeanRunSecs { - static const constexpr double kMaxMeanTime = 1e10; - - static double Mean(const Array& a) { - if (a.empty()) { - return kMaxMeanTime; - } - double sum = 0.0; - for (const FloatImm& i : a) { - sum += i->value; - } - return sum / a.size(); - } - - bool operator()(const TuningRecord& a, const TuningRecord& b) const { - double a_time = Mean(a->run_secs.value_or({})); - double b_time = Mean(b->run_secs.value_or({})); - return a_time < b_time; - } -}; - /*! * \brief Read lines from a json file. * \param path The path to the json file. 
diff --git a/src/meta_schedule/database/ordered_union_database.cc b/src/meta_schedule/database/ordered_union_database.cc new file mode 100644 index 000000000000..3aaee2112c0c --- /dev/null +++ b/src/meta_schedule/database/ordered_union_database.cc @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +class OrderedUnionDatabaseNode : public DatabaseNode { + public: + Array databases; + + void VisitAttrs(AttrVisitor* v) { v->Visit("databases", &databases); } + + static constexpr const char* _type_key = "meta_schedule.OrderedUnionDatabase"; + TVM_DECLARE_FINAL_OBJECT_INFO(OrderedUnionDatabaseNode, DatabaseNode); + + public: + Optional QueryTuningRecord(const IRModule& mod, const Target& target, + const String& task_name) final { + for (const Database& db : databases) { + if (Optional record = db->QueryTuningRecord(mod, target, task_name)) { + return record; + } + } + return NullOpt; + } + + bool HasWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.HasWorkload"; + throw; + } + + Workload CommitWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.CommitWorkload"; + throw; + } + + void CommitTuningRecord(const TuningRecord& record) final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.CommitTuningRecord"; + throw; + } + + Array GetTopK(const Workload& workload, int top_k) final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.GetTopK"; + throw; + } + + Array GetAllTuningRecords() final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.GetAllTuningRecords"; + throw; + } + + int64_t Size() final { + LOG(FATAL) << "NotImplementedError: OrderedUnionDatabase.size"; + throw; + } +}; + +Database Database::OrderedUnionDatabase(Array databases) { + ObjectPtr n = make_object(); + n->databases = std::move(databases); + return Database(n); +} + +TVM_REGISTER_NODE_TYPE(OrderedUnionDatabaseNode); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseOrderedUnionDatabase") + .set_body_typed(Database::OrderedUnionDatabase); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/meta_schedule/database/union_database.cc b/src/meta_schedule/database/union_database.cc new file mode 
100644 index 000000000000..6d19a38c6d9e --- /dev/null +++ b/src/meta_schedule/database/union_database.cc @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace meta_schedule { + +class UnionDatabaseNode : public DatabaseNode { + public: + Array databases; + + void VisitAttrs(AttrVisitor* v) { v->Visit("databases", &databases); } + + static constexpr const char* _type_key = "meta_schedule.UnionDatabase"; + TVM_DECLARE_FINAL_OBJECT_INFO(UnionDatabaseNode, DatabaseNode); + + public: + Optional QueryTuningRecord(const IRModule& mod, const Target& target, + const String& task_name) final { + std::vector results; + results.reserve(databases.size()); + for (const Database& db : databases) { + if (Optional record = db->QueryTuningRecord(mod, target, task_name)) { + results.push_back(record.value()); + } + } + std::stable_sort(results.begin(), results.end(), SortTuningRecordByMeanRunSecs()); + return results.empty() ? 
Optional(NullOpt) : results[0]; + } + + bool HasWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.HasWorkload"; + throw; + } + + Workload CommitWorkload(const IRModule& mod) final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.CommitWorkload"; + throw; + } + + void CommitTuningRecord(const TuningRecord& record) final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.CommitTuningRecord"; + throw; + } + + Array GetTopK(const Workload& workload, int top_k) final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.GetTopK"; + throw; + } + + Array GetAllTuningRecords() final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.GetAllTuningRecords"; + throw; + } + + int64_t Size() final { + LOG(FATAL) << "NotImplementedError: UnionDatabase.size"; + throw; + } +}; + +Database Database::UnionDatabase(Array databases) { + ObjectPtr n = make_object(); + n->databases = std::move(databases); + return Database(n); +} + +TVM_REGISTER_NODE_TYPE(UnionDatabaseNode); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseUnionDatabase").set_body_typed(Database::UnionDatabase); + +} // namespace meta_schedule +} // namespace tvm diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index db37935ec206..ad56fa7f6a52 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -404,6 +404,28 @@ inline Array AsIntArray(const ObjectRef& obj) { return results; } +/*! \brief The struct defining comparison function of sorting by mean run seconds. 
*/ +struct SortTuningRecordByMeanRunSecs { + static const constexpr double kMaxMeanTime = 1e10; + + static double Mean(const Array& a) { + if (a.empty()) { + return kMaxMeanTime; + } + double sum = 0.0; + for (const FloatImm& i : a) { + sum += i->value; + } + return sum / a.size(); + } + + bool operator()(const TuningRecord& a, const TuningRecord& b) const { + double a_time = Mean(a->run_secs.value_or({})); + double b_time = Mean(b->run_secs.value_or({})); + return a_time < b_time; + } +}; + } // namespace meta_schedule } // namespace tvm diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index b14c18e55f4b..e5b8cd77445f 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -412,17 +412,12 @@ def schedule_fn(sch): return True return False - link_params = True - with StringIO() as stderr_buf, redirect_stderr(stderr_buf): with ms.database.ScheduleFnDatabase(schedule_fn), tvm.transform.PassContext( opt_level=3, - config={ - "relay.backend.use_meta_schedule": True, - "relay.FuseOps.link_params": link_params, - }, + config={"relay.backend.use_meta_schedule": True}, ): - executor = Executor("graph", {"link-params": link_params}) + executor = Executor("graph", {"link-params": True}) lib = relay.build(relay_mod, target=target, executor=executor) # Workload look up should succeed. This does not work when the test is invoked from pytest. 
diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py index ff0f350d8914..e6342f1c3536 100644 --- a/tests/python/unittest/test_meta_schedule_database.py +++ b/tests/python/unittest/test_meta_schedule_database.py @@ -294,5 +294,42 @@ def test_meta_schedule_database_reload(): _equal_record(ret[1], records[2]) +def test_meta_schedule_database_union(): + mod: IRModule = Matmul + target = tvm.target.Target("llvm") + arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"]) + db_1 = ms.database.MemoryDatabase() + db_2 = ms.database.MemoryDatabase() + trace = _create_schedule(mod, _schedule_matmul).trace + + def query(db): + return db.query_tuning_record(mod=mod, target=target, workload_name="main").run_secs + + def commit_record(db, run_sec): + db.commit_tuning_record( + ms.database.TuningRecord( + trace, + workload=db.commit_workload(mod), + run_secs=[run_sec], + target=target, + args_info=arg_info, + ) + ) + + commit_record(db_1, 1.0) + (run_sec,) = query(db_1) + assert run_sec.value == 1.0 + + commit_record(db_2, 0.5) + (run_sec,) = query(db_2) + assert run_sec.value == 0.5 + + (run_secs,) = query(ms.database.UnionDatabase(db_1, db_2)) + assert run_secs.value == 0.5 + + (run_secs,) = query(ms.database.OrderedUnionDatabase(db_1, db_2)) + assert run_secs.value == 1.0 + + if __name__ == "__main__": tvm.testing.main() From 8ca8f24d54d65be552448e5368d879710930711b Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 1 Sep 2022 18:56:32 -0700 Subject: [PATCH 094/704] [TIR] Handle DeclBuffer in ToSSA (#12679) --- include/tvm/tir/stmt.h | 1 + src/tir/transforms/ir_utils.cc | 9 +++++++++ tests/python/unittest/test_tir_transform_unroll_loop.py | 9 +++------ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index bee9819a228e..e16d773f02b3 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -713,6 +713,7 @@ class DeclBuffer : public 
Stmt { public: TVM_DLL DeclBuffer(Buffer buffer, Stmt body, Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(DeclBuffer, Stmt, DeclBufferNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(DeclBufferNode); }; /*! diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc index 66b04bd67892..b7e3e01f7506 100644 --- a/src/tir/transforms/ir_utils.cc +++ b/src/tir/transforms/ir_utils.cc @@ -132,6 +132,15 @@ class IRConvertSSA final : public StmtExprMutator { return std::move(output); } + Stmt VisitStmt_(const DeclBufferNode* op) final { + DeclBuffer decl = Downcast(StmtExprMutator::VisitStmt_(op)); + Buffer new_buffer = GetRemappedBuffer(decl->buffer); + if (!new_buffer.same_as(decl->buffer)) { + decl.CopyOnWrite()->buffer = std::move(new_buffer); + } + return std::move(decl); + } + template Node VisitBufferAccess(Node node) { Buffer new_buf = GetRemappedBuffer(node->buffer); diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py index 3a638ba45122..a76e6135b3c4 100644 --- a/tests/python/unittest/test_tir_transform_unroll_loop.py +++ b/tests/python/unittest/test_tir_transform_unroll_loop.py @@ -117,19 +117,16 @@ class before: @T.prim_func def main(): for i in T.unroll(2): - with T.allocate([16], "float32", "global") as buf_data: - buf = T.buffer_decl(shape=[16], dtype="float32", data=buf_data) + with T.decl_buffer([16], "float32") as buf: buf[0] = 0.0 @tvm.script.ir_module class expected: @T.prim_func def main(): - with T.allocate([16], "float32", "global") as buf1_data: - buf1 = T.buffer_decl(shape=[16], dtype="float32", data=buf1_data) + with T.decl_buffer([16], "float32") as buf1: buf1[0] = 0.0 - with T.allocate([16], "float32", "global") as buf2_data: - buf2 = T.buffer_decl(shape=[16], dtype="float32", data=buf2_data) + with T.decl_buffer([16], "float32") as buf2: buf2[0] = 0.0 after = tvm.tir.transform.UnrollLoop()(before) From 4acddb1d036a5f055f5e62f348b18c5e8562140e Mon Sep 17 
00:00:00 2001 From: Siyuan Feng Date: Fri, 2 Sep 2022 13:13:20 +0800 Subject: [PATCH 095/704] [COMMUNITY] Yaxing Cai -> Reviewer (#12683) Please join me in welcoming Yaxing Cai (@cyx-6) as a new reviewer in TVM. Yaxing has brought the PackedFunc into TVM object system ([RFC-051](https://github.com/apache/tvm-rfcs/pull/51)), designed and implemented the new parser infrastructure for TVMScript and meta-programming ([RFC-079](https://github.com/apache/tvm-rfcs/pull/79)) - [Commits History](https://github.com/apache/tvm/commits?author=cyx-6) - [Code Review](https://github.com/apache/tvm/pulls?q=reviewed-by%3Acyx-6+) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 771eb1c63eda..01cf7058a069 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -92,6 +92,7 @@ We do encourage everyone to work anything they are interested in. - [Matthew Barrett](https://github.com/mbaret): @mbaret - [Arnaud Bergeron](https://github.com/abergeron): @abergeron - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart +- [Yaxing Cai](https://github.com/cyx-6): @cyx-6 - [Liangfu Chen](https://github.com/liangfu): @liangfu - [Tianqi Chen](https://github.com/tqchen): @tqchen - [Zhi Chen](https://github.com/zhiics): @zhiics From b2d660006446f720f0c9488f96d28387cbd0d294 Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Fri, 2 Sep 2022 00:24:04 -0700 Subject: [PATCH 096/704] [PyTorch] Fix aten::arange for pytorch (#12681) fix arange for pytorch nightly 20220815 --- python/tvm/relay/frontend/pytorch.py | 32 ++++++++++++---------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 2255396c0633..7c52393b8468 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -348,28 +348,24 @@ def _get_type(val, inp_type): # - if a dtype is given, start, stop, step are converted to that dtype # - if no 
dtype is given and all args are integral, dtype is int64 # - if no dtype is given and there is a float arg, dtype is float32 - if len(inputs) == 5: - dtype0 = _get_type(inputs[0], input_types[0]) - if inputs[1] is not None: - dtype = _convert_dtype_value(inputs[1]) - elif dtype0.startswith("float"): - dtype = "float32" - else: - dtype = "int64" - start = _expr.const(0, dtype) - stop = _get_value(inputs[0], dtype) - step = _expr.const(1, dtype) - elif len(inputs) == 7: - types = [_get_type(inputs[i], input_types[i]) for i in range(3)] - if inputs[3] is not None: - dtype = _convert_dtype_value(inputs[3]) + if len(inputs) in {5, 6, 7}: + # inputs look like [_,_,_,dtype,layout,device,requires_grad] + # therefore dtype_idx is always the length of inputs minus 4 + dtype_idx = len(inputs) - 4 + types = [_get_type(inputs[i], input_types[i]) for i in range(dtype_idx)] + if inputs[dtype_idx] is not None: + dtype = _convert_dtype_value(inputs[dtype_idx]) elif any([t.startswith("float") for t in types]): dtype = "float32" else: dtype = "int64" - start = _get_value(inputs[0], dtype) - stop = _get_value(inputs[1], dtype) - step = _get_value(inputs[2], dtype) + + # - if len(inputs) == 5, inputs = [stop, dtype, ...] + # - if len(inputs) == 6, inputs = [start, stop, dtype, ...] + # - if len(inputs) == 7, inputs = [start, stop, step, dtype, ...] + start = _get_value(inputs[0], dtype) if len(inputs) > 5 else _expr.const(0, dtype) + stop = _get_value(inputs[1 if len(inputs) > 5 else 0], dtype) + step = _get_value(inputs[2], dtype) if len(inputs) > 6 else _expr.const(1, dtype) else: msg = "Unknown number of arguments (%d) to parse." % (len(inputs)) raise AssertionError(msg) From bb56f2a972606b33e5479d1e18d4c4f13751eeed Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 2 Sep 2022 00:47:38 -0700 Subject: [PATCH 097/704] [MetaSchedule][UX] Convenient Object Creation (#12643) This PR introduces a set of `.create` methods making it easier to create MetaSchedule objects. 
For example: ```python ms.database.JSONDatabase(...) ms.database.create("json") ms.runner.RPCRunner(...) ms.runner.create("rpc") ``` Besides, this PR allows `JSONDatabase` to be created via `work_dir`: ```python db = ms.database.create("json", work_dir="/path/to/db/") db = ms.database.create(work_dir="/path/to/db/") # or even simpler ``` --- python/tvm/meta_schedule/builder/__init__.py | 2 +- python/tvm/meta_schedule/builder/builder.py | 17 ++++++++ python/tvm/meta_schedule/database/__init__.py | 2 +- python/tvm/meta_schedule/database/database.py | 41 ++++++++++++++++++- .../meta_schedule/database/json_database.py | 31 +++++++++++--- python/tvm/meta_schedule/runner/__init__.py | 12 +++++- python/tvm/meta_schedule/runner/runner.py | 22 +++++++++- .../meta_schedule/search_strategy/__init__.py | 2 +- .../search_strategy/search_strategy.py | 29 +++++++++++++ .../meta_schedule/space_generator/__init__.py | 2 +- .../space_generator/space_generator.py | 28 +++++++++++++ .../meta_schedule/task_scheduler/__init__.py | 4 +- .../task_scheduler/task_scheduler.py | 20 +++++++++ .../meta_schedule/testing/relay_workload.py | 4 +- 14 files changed, 198 insertions(+), 18 deletions(-) diff --git a/python/tvm/meta_schedule/builder/__init__.py b/python/tvm/meta_schedule/builder/__init__.py index 859c74d75622..ac71e3a0c1fc 100644 --- a/python/tvm/meta_schedule/builder/__init__.py +++ b/python/tvm/meta_schedule/builder/__init__.py @@ -19,5 +19,5 @@ Meta Schedule builders that translate IRModule to runtime.Module, and then export """ -from .builder import Builder, BuilderInput, BuilderResult, PyBuilder +from .builder import Builder, BuilderInput, BuilderResult, PyBuilder, create from .local_builder import LocalBuilder diff --git a/python/tvm/meta_schedule/builder/builder.py b/python/tvm/meta_schedule/builder/builder.py index daa9f7be4214..a2254f243380 100644 --- a/python/tvm/meta_schedule/builder/builder.py +++ b/python/tvm/meta_schedule/builder/builder.py @@ -17,6 +17,10 @@ """Meta 
Schedule builders that translate IRModule to runtime.Module, and then export""" from typing import Callable, Dict, List, Optional +# isort: off +from typing_extensions import Literal + +# isort: on from tvm._ffi import register_object from tvm.ir import IRModule from tvm.runtime import NDArray, Object @@ -164,3 +168,16 @@ def build(self, build_inputs: List[BuilderInput]) -> List[BuilderResult]: The results of building the given inputs. """ raise NotImplementedError + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Literal["local"] = "local", + *args, + **kwargs, +) -> Builder: + """Create a Builder.""" + from . import LocalBuilder # pylint: disable=import-outside-toplevel + + if kind == "local": + return LocalBuilder(*args, **kwargs) # type: ignore + raise ValueError(f"Unknown Builder: {kind}") diff --git a/python/tvm/meta_schedule/database/__init__.py b/python/tvm/meta_schedule/database/__init__.py index 679923e47936..66d011ed5246 100644 --- a/python/tvm/meta_schedule/database/__init__.py +++ b/python/tvm/meta_schedule/database/__init__.py @@ -18,7 +18,7 @@ The tvm.meta_schedule.database package. 
The database that stores serialized tuning records and workloads """ -from .database import Database, PyDatabase, TuningRecord, Workload +from .database import Database, PyDatabase, TuningRecord, Workload, create from .json_database import JSONDatabase from .memory_database import MemoryDatabase from .ordered_union_database import OrderedUnionDatabase diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py index aa509b715132..7a1338f46b20 100644 --- a/python/tvm/meta_schedule/database/database.py +++ b/python/tvm/meta_schedule/database/database.py @@ -17,12 +17,16 @@ """TuningRecord database""" from typing import Any, Callable, List, Optional, Union +# isort: off +from typing_extensions import Literal + +# isort: on + from tvm._ffi import register_object from tvm.ir.module import IRModule from tvm.runtime import Object from tvm.target import Target from tvm.tir.schedule import Schedule, Trace -from typing_extensions import Literal # pylint: disable=wrong-import-order from .. import _ffi_api from ..arg_info import ArgInfo @@ -483,3 +487,38 @@ def __len__(self) -> int: The number of records in the database """ raise NotImplementedError + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Union[ + Literal[ + "json", + "memory", + "union", + "ordered_union", + ], + Callable[[Schedule], bool], + ] = "json", + *args, + **kwargs, +) -> Database: + """Create a Database.""" + from . 
import ( # pylint: disable=import-outside-toplevel + JSONDatabase, + MemoryDatabase, + OrderedUnionDatabase, + ScheduleFnDatabase, + UnionDatabase, + ) + + if callable(kind): + return ScheduleFnDatabase(kind, *args, **kwargs) # type: ignore + if kind == "json": + return JSONDatabase(*args, **kwargs) + if kind == "memory": + return MemoryDatabase(*args, **kwargs) # type: ignore + if kind == "union": + return UnionDatabase(*args, **kwargs) # type: ignore + if kind == "ordered_union": + return OrderedUnionDatabase(*args, **kwargs) # type: ignore + raise ValueError(f"Unknown Database: {kind}") diff --git a/python/tvm/meta_schedule/database/json_database.py b/python/tvm/meta_schedule/database/json_database.py index 6897b82d9888..b36ac61ef2fb 100644 --- a/python/tvm/meta_schedule/database/json_database.py +++ b/python/tvm/meta_schedule/database/json_database.py @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. """The default database that uses a JSON File to store tuning records""" +import os.path as osp +from typing import Optional + from tvm._ffi import register_object from .. import _ffi_api @@ -38,21 +41,37 @@ class JSONDatabase(Database): def __init__( self, - path_workload: str, - path_tuning_record: str, + path_workload: Optional[str] = None, + path_tuning_record: Optional[str] = None, + *, + work_dir: Optional[str] = None, allow_missing: bool = True, ) -> None: """Constructor. Parameters ---------- - path_workload : str - The path to the workload table. - path_tuning_record : str - The path to the tuning record table. + path_workload : Optional[str] = None + The path to the workload table. If not specified, + will be generated from `work_dir` as `$work_dir/database_workload.json`. + path_tuning_record : Optional[str] = None + The path to the tuning record table. If not specified, + will be generated from `work_dir` as `$work_dir/database_tuning_record.json`. 
+ work_dir : Optional[str] = None + The work directory, if specified, will be used to generate `path_tuning_record` + and `path_workload`. allow_missing : bool Whether to create new file when the given path is not found. """ + if work_dir is not None: + if path_workload is None: + path_workload = osp.join(work_dir, "database_workload.json") + if path_tuning_record is None: + path_tuning_record = osp.join(work_dir, "database_tuning_record.json") + if path_workload is None: + raise ValueError("`path_workload` is not specified.") + if path_tuning_record is None: + raise ValueError("`path_tuning_record` is not specified.") self.__init_handle_by_constructor__( _ffi_api.DatabaseJSONDatabase, # type: ignore # pylint: disable=no-member path_workload, diff --git a/python/tvm/meta_schedule/runner/__init__.py b/python/tvm/meta_schedule/runner/__init__.py index 413bea6d2fab..f0e1028bbf28 100644 --- a/python/tvm/meta_schedule/runner/__init__.py +++ b/python/tvm/meta_schedule/runner/__init__.py @@ -19,6 +19,14 @@ Meta Schedule runners that runs an artifact either locally or through the RPC interface """ from .config import EvaluatorConfig, RPCConfig -from .rpc_runner import RPCRunner from .local_runner import LocalRunner, LocalRunnerFuture -from .runner import PyRunner, Runner, RunnerFuture, RunnerInput, RunnerResult, PyRunnerFuture +from .rpc_runner import RPCRunner +from .runner import ( + PyRunner, + PyRunnerFuture, + Runner, + RunnerFuture, + RunnerInput, + RunnerResult, + create, +) diff --git a/python/tvm/meta_schedule/runner/runner.py b/python/tvm/meta_schedule/runner/runner.py index 90b53fde8c29..539e47f15c41 100644 --- a/python/tvm/meta_schedule/runner/runner.py +++ b/python/tvm/meta_schedule/runner/runner.py @@ -15,7 +15,12 @@ # specific language governing permissions and limitations # under the License. 
"""Runners""" -from typing import Callable, Optional, List +from typing import Callable, List, Optional + +# isort: off +from typing_extensions import Literal + +# isort: on from tvm._ffi import register_object from tvm.runtime import Object @@ -223,3 +228,18 @@ def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]: The runner futures. """ raise NotImplementedError + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Literal["local", "rpc"] = "local", + *args, + **kwargs, +) -> Runner: + """Create a Runner.""" + from . import LocalRunner, RPCRunner # pylint: disable=import-outside-toplevel + + if kind == "local": + return LocalRunner(*args, **kwargs) # type: ignore + elif kind == "rpc": + return RPCRunner(*args, **kwargs) # type: ignore + raise ValueError(f"Unknown Runner: {kind}") diff --git a/python/tvm/meta_schedule/search_strategy/__init__.py b/python/tvm/meta_schedule/search_strategy/__init__.py index 2046067d6c00..ffe7e1473954 100644 --- a/python/tvm/meta_schedule/search_strategy/__init__.py +++ b/python/tvm/meta_schedule/search_strategy/__init__.py @@ -23,4 +23,4 @@ from .evolutionary_search import EvolutionarySearch from .replay_func import ReplayFunc from .replay_trace import ReplayTrace -from .search_strategy import MeasureCandidate, PySearchStrategy, SearchStrategy +from .search_strategy import MeasureCandidate, PySearchStrategy, SearchStrategy, create diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py index 1cd8a448fe8e..e88cdf825a79 100644 --- a/python/tvm/meta_schedule/search_strategy/search_strategy.py +++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py @@ -20,6 +20,10 @@ """ from typing import TYPE_CHECKING, Callable, List, Optional +# isort: off +from typing_extensions import Literal + +# isort: on from tvm._ffi import register_object from tvm.runtime import Object from tvm.tir.schedule import Schedule @@ -245,3 +249,28 
@@ def notify_runner_results( The profiling results from the runner. """ raise NotImplementedError + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Literal[ + "evolutionary", + "replay_trace", + "replay_func", + ] = "evolutionary", + *args, + **kwargs, +) -> SearchStrategy: + """Create a search strategy.""" + from . import ( # pylint: disable=import-outside-toplevel + EvolutionarySearch, + ReplayFunc, + ReplayTrace, + ) + + if kind == "evolutionary": + return EvolutionarySearch(*args, **kwargs) + if kind == "replay_trace": + return ReplayTrace(*args, **kwargs) + if kind == "replay_func": + return ReplayFunc(*args, **kwargs) + raise ValueError(f"Unknown SearchStrategy: {kind}") diff --git a/python/tvm/meta_schedule/space_generator/__init__.py b/python/tvm/meta_schedule/space_generator/__init__.py index d2039c4511c9..c417ec2d7d4a 100644 --- a/python/tvm/meta_schedule/space_generator/__init__.py +++ b/python/tvm/meta_schedule/space_generator/__init__.py @@ -21,5 +21,5 @@ """ from .post_order_apply import PostOrderApply from .schedule_fn import ScheduleFn -from .space_generator import PySpaceGenerator, ScheduleFnType, SpaceGenerator +from .space_generator import PySpaceGenerator, ScheduleFnType, SpaceGenerator, create from .space_generator_union import SpaceGeneratorUnion diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py index 74c29b4de0dd..9d7ebf3bae26 100644 --- a/python/tvm/meta_schedule/space_generator/space_generator.py +++ b/python/tvm/meta_schedule/space_generator/space_generator.py @@ -20,6 +20,10 @@ """ from typing import TYPE_CHECKING, Callable, List, Optional, Union +# isort: off +from typing_extensions import Literal + +# isort: on from tvm._ffi import register_object from tvm.ir import IRModule from tvm.runtime import Object @@ -132,3 +136,27 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]: The generated design spaces, i.e., 
schedules. """ raise NotImplementedError + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Union[ + Literal["post_order_apply", "union"], + ScheduleFnType, + ] = "post_order_apply", + *args, + **kwargs, +) -> SpaceGenerator: + """Create a design space generator.""" + from . import ( # pylint: disable=import-outside-toplevel + PostOrderApply, + ScheduleFn, + SpaceGeneratorUnion, + ) + + if callable(kind): + return ScheduleFn(kind, *args, **kwargs) # type: ignore + if kind == "post_order_apply": + return PostOrderApply(*args, **kwargs) + if kind == "union": + return SpaceGeneratorUnion(*args, **kwargs) + raise ValueError(f"Unknown SpaceGenerator: {kind}") diff --git a/python/tvm/meta_schedule/task_scheduler/__init__.py b/python/tvm/meta_schedule/task_scheduler/__init__.py index 1a67aa6f6831..51985570b06f 100644 --- a/python/tvm/meta_schedule/task_scheduler/__init__.py +++ b/python/tvm/meta_schedule/task_scheduler/__init__.py @@ -20,6 +20,6 @@ for measure candidates generation and measurement, then save records to the database. 
""" -from .task_scheduler import TaskScheduler, PyTaskScheduler -from .round_robin import RoundRobin from .gradient_based import GradientBased +from .round_robin import RoundRobin +from .task_scheduler import PyTaskScheduler, TaskScheduler, create diff --git a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py index 3d57a6b01b9d..29a5f18dfb8a 100644 --- a/python/tvm/meta_schedule/task_scheduler/task_scheduler.py +++ b/python/tvm/meta_schedule/task_scheduler/task_scheduler.py @@ -19,6 +19,11 @@ import logging from typing import Callable, List, Optional +# isort: off +from typing_extensions import Literal + +# isort: on + from tvm._ffi import register_object from tvm.runtime import Object @@ -255,3 +260,18 @@ def touch_task(self, task_id: int) -> None: """ # Using self._outer to replace the self pointer _ffi_api.TaskSchedulerTouchTask(self._outer(), task_id) # type: ignore # pylint: disable=no-member + + +def create( # pylint: disable=keyword-arg-before-vararg + kind: Literal["round-robin", "gradient"] = "gradient", + *args, + **kwargs, +) -> "TaskScheduler": + """Create a task scheduler.""" + from . 
import GradientBased, RoundRobin # pylint: disable=import-outside-toplevel + + if kind == "round-robin": + return RoundRobin(*args, **kwargs) + if kind == "gradient": + return GradientBased(*args, **kwargs) + raise ValueError(f"Unknown TaskScheduler name: {kind}") diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py index 016263489527..f4f6336df33f 100644 --- a/python/tvm/meta_schedule/testing/relay_workload.py +++ b/python/tvm/meta_schedule/testing/relay_workload.py @@ -85,7 +85,7 @@ def _get_network( "float32": torch.float32, # pylint: disable=no-member }[dtype] ) - scripted_model = torch.jit.trace(model, input_data).eval() + scripted_model = torch.jit.trace(model, input_data).eval() # type: ignore input_name = "input0" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) @@ -149,7 +149,7 @@ def _get_network( input_dtype = "int64" a = torch.randint(10000, input_shape) # pylint: disable=no-member model.eval() - scripted_model = torch.jit.trace(model, [a], strict=False) + scripted_model = torch.jit.trace(model, [a], strict=False) # type: ignore input_name = "input_ids" shape_list = [(input_name, input_shape)] mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) From 445a14f4c637ea88f4a1c39ed238da752fc6cecf Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 2 Sep 2022 08:53:44 +0100 Subject: [PATCH 098/704] [ETHOSN] Fix some more pylint issues (#12675) Fixing a few more pylint issues caught when using pylint==2.9.3. 
Change-Id: Ie7ca61e1a8083a40e0ffccf1418192966884707a --- tests/python/contrib/test_ethosn/infrastructure.py | 3 ++- .../contrib/test_ethosn/test_convert_equivalents.py | 1 + tests/python/contrib/test_ethosn/test_networks.py | 10 ++++++---- tests/python/contrib/test_ethosn/test_reshape.py | 6 ++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 0071b1a7f52e..c658b33747c3 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -67,7 +67,8 @@ def assert_lib_hash(lib, golden): for mod in lib.imported_modules: if mod.type_key == "ethos-n": mod.save(path) - lib_hash = md5(open(path, "rb").read()).hexdigest() + with open(path, "rb") as compiled_model: + lib_hash = md5(compiled_model.read()).hexdigest() hash_set.add(lib_hash) assert hash_set == golden, "Expected hash: {} Got hash: {}".format(golden, hash_set) diff --git a/tests/python/contrib/test_ethosn/test_convert_equivalents.py b/tests/python/contrib/test_ethosn/test_convert_equivalents.py index fe9b346691b6..c8d1b5729d83 100644 --- a/tests/python/contrib/test_ethosn/test_convert_equivalents.py +++ b/tests/python/contrib/test_ethosn/test_convert_equivalents.py @@ -158,6 +158,7 @@ class ConversionChecker(ExprVisitor): sequence = ["qnn.conv2d", "nn.bias_add", "qnn.requantize"] + # pylint: disable=invalid-name def visit_function(self, fn): composite_name = fn.attrs["Composite"] expected = "ethos-n.qnn_conv2d" diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index db1b41244846..b584a579b8be 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -34,10 +34,7 @@ def _get_tflite_model(tflite_model_path, inputs_dict, dtype): with open(tflite_model_path, "rb") as f: tflite_model_buffer = f.read() - try: - 
tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0) - except AttributeError: - tflite_model = tflite.Model.GetRootAsModel(tflite_model_buffer, 0) + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buffer, 0) shape_dict = {} dtype_dict = {} for value in inputs_dict: @@ -116,6 +113,7 @@ def get_model(): @requires_ethosn def test_mobilenet_v1(): + """Compare compile hashes for mobilenetv1 with an expected result.""" # If this test is failing due to a hash mismatch, please notify @lhutton1 and # @Leo-arm. The hash is there to catch any changes in the behaviour of the # codegen, which could come about from either a change in Support Library @@ -137,6 +135,7 @@ def test_mobilenet_v1(): @requires_ethosn def test_resnet_50_int8(): + """Compare compile hashes for resnet50 with an expected result.""" # If this test is failing due to a hash mismatch, please notify @lhutton1 and # @Leo-arm. The hash is there to catch any changes in the behaviour of the # codegen, which could come about from either a change in Support Library @@ -157,6 +156,7 @@ def test_resnet_50_int8(): @requires_ethosn def test_inception_v3(): + """Compare compile hashes for inceptionv3 with an expected result.""" # If this test is failing due to a hash mismatch, please notify @lhutton1 and # @Leo-arm. The hash is there to catch any changes in the behaviour of the # codegen, which could come about from either a change in Support Library @@ -177,6 +177,7 @@ def test_inception_v3(): @requires_ethosn def test_inception_v4(): + """Compare compile hashes for inceptionv4 with an expected result.""" # If this test is failing due to a hash mismatch, please notify @lhutton1 and # @Leo-arm. 
The hash is there to catch any changes in the behaviour of the # codegen, which could come about from either a change in Support Library @@ -197,6 +198,7 @@ def test_inception_v4(): @requires_ethosn def test_ssd_mobilenet_v1(): + """Compare compile hashes for ssdmobilenetv1 with an expected result.""" # If this test is failing due to a hash mismatch, please notify @lhutton1 and # @Leo-arm. The hash is there to catch any changes in the behaviour of the # codegen, which could come about from either a change in Support Library diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py index e165cea9c63b..2d6eae9b2522 100644 --- a/tests/python/contrib/test_ethosn/test_reshape.py +++ b/tests/python/contrib/test_ethosn/test_reshape.py @@ -17,11 +17,13 @@ """Arm(R) Ethos(TM)-N integration reshape tests""" +import numpy as np +import pytest + import tvm from tvm import relay from tvm.testing import requires_ethosn -import numpy as np -import pytest + from . import infrastructure as tei From 0549a08f4de40a5a0db277cfe1ae00ab22fc9107 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Fri, 2 Sep 2022 13:52:23 +0100 Subject: [PATCH 099/704] [ETHOSN] Add support for concatenate with negative axis (#12686) Supports offloading concatenate with a negative axis to the NPU. In addition, parameterized the concatenate unit tests. 
--- .../backend/contrib/ethosn/ethosn_api.cc | 7 ++- .../contrib/test_ethosn/test_concatenate.py | 49 ++++++++++--------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index 55e8901dae08..4f01c924cf6e 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -520,7 +520,12 @@ EthosnError EthosnAPI::LeakyReLU(const Expr& expr, LeakyReLUParams* params) { EthosnError EthosnAPI::Concatenate(const Expr& expr, ConcatenateParams* params) { Call call = Downcast(expr); const auto& attrs = call->attrs.as(); - params->concat_info.m_Axis = attrs->axis; + int axis = attrs->axis; + if (axis < 0) { + int output_dims = Downcast(call->checked_type())->shape.size(); + axis = output_dims + axis; + } + params->concat_info.m_Axis = axis; float output_sc; int output_zp; diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py index cd4ec7a4e4b2..0389b3c5b103 100644 --- a/tests/python/contrib/test_ethosn/test_concatenate.py +++ b/tests/python/contrib/test_ethosn/test_concatenate.py @@ -56,33 +56,35 @@ def _get_model(shapes, dtype, axis): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_concatenate(dtype): - """Compare Concatenate output with TVM.""" - - trials = [ +@pytest.mark.parametrize( + "shapes,axis", + [ ([(1, 4), (1, 6)], 1), ([(1, 16, 4), (1, 16, 4)], 1), ([(1, 25, 4, 16)] * 3, 3), ([(1, 25, 4, 16), (1, 25, 5, 16), (1, 25, 6, 16)], 2), - ] - + ([(1, 4), (1, 6)], -1), + ([(1, 16, 4), (1, 16, 4)], -2), + ], +) +def test_concatenate(dtype, shapes, axis): + """Compare Concatenate output with TVM.""" np.random.seed(0) - for shapes, axis in trials: - outputs = [] - inputs = _get_inputs(shapes, dtype) - for npu in [False, True]: - model = _get_model(shapes, dtype, axis) - mod = tei.make_module(model, {}) - 
outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + + outputs = [] + inputs = _get_inputs(shapes, dtype) + for npu in [False, True]: + model = _get_model(shapes, dtype, axis) + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) tei.verify(outputs, dtype, 0) @requires_ethosn -def test_concatenate_failure(): - """Check Concatenate error messages.""" - - trials = [ +@pytest.mark.parametrize( + "shapes,dtype,axis,err_msg", + [ ([(1, 4, 4, 4, 4), (1, 4, 4, 4, 4)], "uint8", 1, "dimensions=5, dimensions must be <= 4;"), ( [(1, 4, 4, 4), (1, 4, 4, 4)], @@ -110,9 +112,10 @@ def test_concatenate_failure(): 0, "Concatenation cannot be performed along batch axis (axis 0);", ), - ] - - for shapes, dtype, axis, err_msg in trials: - model = _get_model(shapes, dtype, axis) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + ], +) +def test_concatenate_failure(shapes, dtype, axis, err_msg): + """Check Concatenate error messages.""" + model = _get_model(shapes, dtype, axis) + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) From 7c7b0f7a2fb7833a3afe8900f8b38ccf144f96f0 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 2 Sep 2022 09:44:22 -0700 Subject: [PATCH 100/704] [ci][tvmbot] Trigger GitHub Actions after merging (#12361) This fixes the issue where merging from GitHub Actions (i.e. with the default `GITHUB_TOKEN`) doesn't trigger post merge GitHub Actions on the commit it creates in `main`. Instead these jobs are triggered manually by a call to the Actions API after the merge has taken place. 
This also updates the tvmbot testing code (and by extension some of the other CI testing code) to remove the fixtures for each test in favor of constructing them from a single sample at runtime; this makes it a lot easier to add new tests and see what is different between each data sample and clean up the testing anti-patterns that were there before (e.g. `run()` instead of `pytest.mark.parametrize`), but none of the tests in `test_ci.py` have changed. Tested in https://github.com/driazati/tvm/pull/36 which ran https://github.com/driazati/tvm/actions/runs/2881047903 --- ci/scripts/github_tvmbot.py | 22 +- tests/python/ci/sample_prs/pr10786-badci.json | 130 --- .../sample_prs/pr10786-changes-requested.json | 131 --- .../ci/sample_prs/pr10786-co-authors.json | 129 --- .../ci/sample_prs/pr10786-invalid-author.json | 130 --- .../python/ci/sample_prs/pr10786-merges.json | 129 --- .../ci/sample_prs/pr10786-missing-job.json | 129 --- .../ci/sample_prs/pr10786-nottriggered.json | 129 --- .../ci/sample_prs/pr10786-oldreview.json | 129 --- ...{pr10786-ignore-jobs.json => pr10786.json} | 5 +- .../pr11244-unauthorized-comment.json | 103 --- .../ci/sample_prs/pr11267-no-review.json | 144 ---- .../ci/sample_prs/pr11442-rerun-ci.json | 183 ---- tests/python/ci/test_ci.py | 803 ++++++++---------- tests/python/ci/test_tvmbot.py | 400 +++++---- tests/python/ci/test_utils.py | 33 +- 16 files changed, 624 insertions(+), 2105 deletions(-) delete mode 100644 tests/python/ci/sample_prs/pr10786-badci.json delete mode 100644 tests/python/ci/sample_prs/pr10786-changes-requested.json delete mode 100644 tests/python/ci/sample_prs/pr10786-co-authors.json delete mode 100644 tests/python/ci/sample_prs/pr10786-invalid-author.json delete mode 100644 tests/python/ci/sample_prs/pr10786-merges.json delete mode 100644 tests/python/ci/sample_prs/pr10786-missing-job.json delete mode 100644 tests/python/ci/sample_prs/pr10786-nottriggered.json delete mode 100644
tests/python/ci/sample_prs/pr10786-oldreview.json rename tests/python/ci/sample_prs/{pr10786-ignore-jobs.json => pr10786.json} (78%) delete mode 100644 tests/python/ci/sample_prs/pr11244-unauthorized-comment.json delete mode 100644 tests/python/ci/sample_prs/pr11267-no-review.json delete mode 100644 tests/python/ci/sample_prs/pr11442-rerun-ci.json diff --git a/ci/scripts/github_tvmbot.py b/ci/scripts/github_tvmbot.py index 3a39e69694d8..ee9607dd0254 100755 --- a/ci/scripts/github_tvmbot.py +++ b/ci/scripts/github_tvmbot.py @@ -195,6 +195,7 @@ def __init__( self.number = number self.repo_name = repo self.dry_run = dry_run + self.has_error = False if dry_run and raw_data: # In test mode there is no need to fetch anything @@ -468,7 +469,10 @@ def find_missing_expected_jobs(self) -> List[str]: def trigger_gha_ci(self, sha: str) -> None: logging.info(f"POST-ing a workflow_dispatch event to main.yml") - r = self.github.post( + actions_github = GitHubRepo( + user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN + ) + r = actions_github.post( url="actions/workflows/main.yml/dispatches", data={ "ref": "main", @@ -537,9 +541,12 @@ def rerun_github_actions(self) -> None: workflow_ids = list(set(workflow_ids)) logging.info(f"Rerunning GitHub Actions workflows with IDs: {workflow_ids}") - actions_github = GitHubRepo( - user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN - ) + if self.dry_run: + actions_github = None + else: + actions_github = GitHubRepo( + user=self.github.user, repo=self.github.repo, token=GH_ACTIONS_TOKEN + ) for workflow_id in workflow_ids: if self.dry_run: logging.info(f"Dry run, not restarting workflow {workflow_id}") @@ -576,6 +583,7 @@ def comment_failure(self, msg: str, exceptions: Union[Exception, List[Exception] comment += "" pr.comment(comment) + pr.has_error = True return exception @@ -750,6 +758,9 @@ def run(pr: PR): for name, check in command_to_run.auth: if check(pr, comment, args): logging.info(f"Passed auth check 
'{name}', continuing") + # Only one authorization check needs to pass (e.g. just mentionable + # or PR author), not all of them so quit + break else: logging.info(f"Failed auth check '{name}', quitting") # Add a sad face @@ -767,3 +778,6 @@ def run(pr: PR): # Run the command command_to_run.run(pr) + + if pr.has_error: + raise RuntimeError("PR commented a failure") diff --git a/tests/python/ci/sample_prs/pr10786-badci.json b/tests/python/ci/sample_prs/pr10786-badci.json deleted file mode 100644 index 7e9d10d0b648..000000000000 --- a/tests/python/ci/sample_prs/pr10786-badci.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "FAILED", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": "2022-03-25T22:13:50Z", - 
"authorCanPushToRepository": true, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "id": 123, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-changes-requested.json b/tests/python/ci/sample_prs/pr10786-changes-requested.json deleted file mode 100644 index 24e261099a4f..000000000000 --- a/tests/python/ci/sample_prs/pr10786-changes-requested.json +++ /dev/null @@ -1,131 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "CHANGES_REQUESTED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": "2022-03-25T22:13:50Z", - "url": 
"https://github.com/apache/tvm/pull/10786#pullrequestreview-922186273", - "authorCanPushToRepository": true, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "id": 123, - "author": { - "login": "kparzysz-quic" - }, - "state": "CHANGES_REQUESTED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-co-authors.json b/tests/python/ci/sample_prs/pr10786-co-authors.json deleted file mode 100644 index 75f272825059..000000000000 --- a/tests/python/ci/sample_prs/pr10786-co-authors.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Some One", - "email": "someone@email.com" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": "2022-03-25T22:13:50Z", - 
"authorCanPushToRepository": true, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-invalid-author.json b/tests/python/ci/sample_prs/pr10786-invalid-author.json deleted file mode 100644 index 81b028e3196a..000000000000 --- a/tests/python/ci/sample_prs/pr10786-invalid-author.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "id": 123, - "updatedAt": "2022-03-25T22:13:50Z", - 
"authorCanPushToRepository": false, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-merges.json b/tests/python/ci/sample_prs/pr10786-merges.json deleted file mode 100644 index 0226c8ab5245..000000000000 --- a/tests/python/ci/sample_prs/pr10786-merges.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free.\n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n\nPreviously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\n\n\ncc @someone\n\r\n\r\nCo-authored-by: Adam Straw \n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. 
After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": 
"2022-03-25T22:13:50Z", - "authorCanPushToRepository": true, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-missing-job.json b/tests/python/ci/sample_prs/pr10786-missing-job.json deleted file mode 100644 index 13739b793fb5..000000000000 --- a/tests/python/ci/sample_prs/pr10786-missing-job.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/definitely-not-pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": "2022-03-25T22:13:50Z", - 
"authorCanPushToRepository": true, - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-nottriggered.json b/tests/python/ci/sample_prs/pr10786-nottriggered.json deleted file mode 100644 index 0da541c4342d..000000000000 --- a/tests/python/ci/sample_prs/pr10786-nottriggered.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "", - "updatedAt": "2022-03-25T22:13:50Z", - "authorCanPushToRepository": true, - 
"commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-oldreview.json b/tests/python/ci/sample_prs/pr10786-oldreview.json deleted file mode 100644 index 1a2556cb6f5f..000000000000 --- a/tests/python/ci/sample_prs/pr10786-oldreview.json +++ /dev/null @@ -1,129 +0,0 @@ -{ - "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. 
Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Eric Lunderberg", - "email": "elunderberg@octoml.ai" - }, - { - "name": "Adam Straw", - "email": "astraw@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945392" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945029" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945030" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/5694945524" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-10786/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "@tvm-bot merge", - "updatedAt": "2022-03-25T22:13:50Z", - 
"authorCanPushToRepository": true, - "commit": { - "oid": "abc12345" - }, - "author": { - "login": "kparzysz-quic" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr10786-ignore-jobs.json b/tests/python/ci/sample_prs/pr10786.json similarity index 78% rename from tests/python/ci/sample_prs/pr10786-ignore-jobs.json rename to tests/python/ci/sample_prs/pr10786.json index dfcd806ff14b..79f20ca6094b 100644 --- a/tests/python/ci/sample_prs/pr10786-ignore-jobs.json +++ b/tests/python/ci/sample_prs/pr10786.json @@ -1,6 +1,6 @@ { "title": "[Hexagon] 2-d allocation cleanup", - "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free. Previously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\r\n\r\nCo-authored-by: Adam Straw ", + "body": "- Added device validity check in allocation. HexagonDeviceAPI should only be called for CPU/Hexagon types.\r\n\r\n- Check for \"global.vtcm\" scope instead of \"vtcm\". The ccope of N-d allocations produced by `LowerVtcmAlloc` should be `\"global.vtcm\"`. The previous check allowed unsupported scope such as `\"local.vtcm\"`.\r\n\r\n- Remove `vtcmallocs` entry after calling free.\n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. 
After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n\nPreviously, the vtcm allocation map kept dangling pointers to `HexagonBuffer` objects after they had been freed.\r\n\r\n- Rename N-d alloc and free packed functions. Since most of the similar device functions use snake case, renaming `*.AllocND` to `*.alloc_nd` and `*.FreeND` to `*.free_nd`.\n\n\ncc @someone\n\r\n\r\nCo-authored-by: Adam Straw \n\n\nThanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread.\n\n", "state": "OPEN", "author": { "login": "abc" @@ -65,7 +65,7 @@ } }, "status": "COMPLETED", - "conclusion": "FAILED", + "conclusion": "SUCCESS", "url": "https://github.com/apache/tvm/runs/5694945029" }, { @@ -119,7 +119,6 @@ "commit": { "oid": "6f04bcf57d07f915a98fd91178f04d9e92a09fcd" }, - "id": 123, "author": { "login": "kparzysz-quic" }, diff --git a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json b/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json deleted file mode 100644 index beafc05958b6..000000000000 --- a/tests/python/ci/sample_prs/pr11244-unauthorized-comment.json +++ /dev/null @@ -1,103 +0,0 @@ -{ - "title": "[CRT runtime] Added functions TVMPlatformPreFuncCall and TVMPlatformPostFuncCall", - "body": "See [this thread ](https://discuss.tvm.apache.org/t/crt-add-platform-specific-pre-and-post-function-calls-in-crt-runtime/12723)for an explanation.", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "authorAssociation": "NONE", - "author": { - "login": "abc" - }, - 
"updatedAt": "2022-05-09T13:39:04Z", - "body": "@tvm-bot merge" - }, - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "areusch" - }, - "updatedAt": "2022-05-11T19:22:01Z", - "body": "i commented on the discuss forum thread. let's resolve there and then continue this PR." - } - ] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "Federico Peccia", - "email": "peccia@fzi.de" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "79d355c5f837b3bdadb5d25b2a5d0d2802783ae2", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6352791017" - }, - { - "name": "tag-teams", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6352791014" - }, - { - "state": "ERROR", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11244/1/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "REVIEW_REQUIRED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr11267-no-review.json b/tests/python/ci/sample_prs/pr11267-no-review.json deleted file mode 100644 index d2ad164673e5..000000000000 --- a/tests/python/ci/sample_prs/pr11267-no-review.json +++ /dev/null @@ -1,144 +0,0 @@ -{ - "title": "[ci][docker] Use sccache everywhere by default", - "body": "This adds `/opt/sccache` to the PATH of each of the CI docker images so when cmake looks for a C compiler it will pick up the sccache wrapper by default. 
This fixes some issues where compiler invocations weren't being run though sccache. With this approach the invoker doesn't need to do anything specific to set up sccache.\n\nThis will require a follow up PR to update the Docker images and remove some of the sccache logic in `task_build.py`\n\n\n\ncc @Mousius @areusch", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "areusch" - }, - "id": 124, - "updatedAt": "2022-05-11T16:54:32Z", - "body": "just confirming--we can disable this when doing a local build, correct? what's the mechanism by which we do that?" - }, - { - "authorAssociation": "COLLABORATOR", - "author": { - "login": "driazati" - }, - "id": 123, - "updatedAt": "2022-05-11T18:46:54Z", - "body": "@tvm-bot merge" - } - ] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "driazati", - "email": "driazati@users.noreply.github.com" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "bb7f51d3e0fd50997012dfcce3c9b2b852cd3136", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6377784092" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6377778488" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6390508806" - }, - { - "name": "tag-teams", - "checkSuite": { - 
"workflowRun": { - "workflow": { - "name": "Teams" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6390511833" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6377784248" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11267/2/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "REVIEW_REQUIRED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [] - } -} \ No newline at end of file diff --git a/tests/python/ci/sample_prs/pr11442-rerun-ci.json b/tests/python/ci/sample_prs/pr11442-rerun-ci.json deleted file mode 100644 index 0199b2921f64..000000000000 --- a/tests/python/ci/sample_prs/pr11442-rerun-ci.json +++ /dev/null @@ -1,183 +0,0 @@ -{ - "title": "Add 'static_library' runtime::Module", - "body": "(See https://discuss.tvm.apache.org/t/byoc-supporting-cutlass-byoc-with-collage/12796/6 for\r\ncontext, which in turn is part of Collage (https://github.com/apache/tvm-rfcs/blob/main/rfcs/0062-collage.md).\r\n\r\nThis adds a new 'DSO exportable' runtime module representing the contents of a .o file. 
It\r\nallows external codegen toolchains to yield a result which:\r\n - Like CSource modules, can be conveyed directly to the final export_library compilation\r\n step for linking into the final .so and saved to a know location without risk the\r\n underlying code artifact will be lost.\r\n - Like DSOLibrary modules, are self contained so that no additional compile-time arguments\r\n need be conveyed from the CSource module to the final export_library command line\r\n\r\nSince this is the third flavor of 'DSO exportable' module, add a Module::IsDSOExportable.\r\n\r\nSince adding the above, can't resist also adding a Module::ImplementsFunction virtual and\r\ncalling it from TEComplier to check if an external codegen function actually provided the\r\nimplementation it promised.\r\n\r\nNote:\r\n - I've left the existing implementation of runtime.load_module alone which\r\n relinks .o files to .so files.\r\n - Though also contained in the .o metadata, I require static libraries to always\r\n carry their list of exported function names.\r\n\r\nThis is all pretty stop gap pending a good rework of TVM to supoprt the notion of artifacts\r\nand, perhaps, build rules.\r\n", - "state": "OPEN", - "author": { - "login": "abc" - }, - "comments": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "authorAssociation": "MEMBER", - "author": { - "login": "tqchen" - }, - "updatedAt": "2022-05-24T22:13:29Z", - "body": "Thanks @mbs-octoml . I think we go with this as a temp workaround with a mind that the IsDSOExportable and ImplementsFunction likely should go to Artifact." 
- }, - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "mbs-octoml" - }, - "updatedAt": "2022-05-24T22:56:07Z", - "body": "Yeah, we really need to put some love into that.\r\n\r\nCollecting all the pieces needed for deployment along with their metadata a la Artifact is pretty clearly needed, though I suspect that will need to be abstract to cover the spectrum from firmware image to dynamically loadable .so to ready-to-call JITed code to tar.\r\n\r\nI can't help thinking we should also think about build rules guarded by target kinds & attributes, since again there's just so may ways to proceed." - }, - { - "authorAssociation": "MEMBER", - "author": { - "login": "tqchen" - }, - "updatedAt": "2022-05-24T23:08:00Z", - "body": "Perhaps we will end up building our own cmake/bazel :p in another time" - }, - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "mbs-octoml" - }, - "updatedAt": "2022-05-25T22:11:44Z", - "body": "Thanks Tianqi. Let's see if this new fancy bot works...\r\n\r\n" - }, - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "mbs-octoml" - }, - "updatedAt": "2022-05-25T22:11:50Z", - "body": "@tvm-bot merge" - }, - { - "authorAssociation": "NONE", - "author": { - "login": "github-actions" - }, - "updatedAt": "2022-05-25T22:12:10Z", - "body": "Cannot merge, did not find any approving reviews from users with write access on 96d4e62da5a7b78da18d0ee28cc6261d8fbf31c4" - }, - { - "authorAssociation": "CONTRIBUTOR", - "author": { - "login": "mbs-octoml" - }, - "updatedAt": "2022-05-25T22:12:37Z", - "body": "@tvm-bot rerun" - } - ] - }, - "authorCommits": { - "nodes": [ - { - "commit": { - "authors": { - "nodes": [ - { - "name": "mbs-octoml", - "email": "mbs@octoml.ai" - } - ] - } - } - } - ] - }, - "commits": { - "nodes": [ - { - "commit": { - "oid": "96d4e62da5a7b78da18d0ee28cc6261d8fbf31c4", - "statusCheckRollup": { - "contexts": { - "pageInfo": { - "hasNextPage": false - }, - "nodes": [ - { - "name": "MacOS", - 
"checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6598275844" - }, - { - "name": "cc-reviewers", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "PR" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6598273162" - }, - { - "name": "Windows", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6598275717" - }, - { - "name": "Android", - "checkSuite": { - "workflowRun": { - "workflow": { - "name": "CI" - } - } - }, - "status": "COMPLETED", - "conclusion": "SUCCESS", - "url": "https://github.com/apache/tvm/runs/6598275593" - }, - { - "state": "SUCCESS", - "context": "tvm-ci/pr-head", - "targetUrl": "https://ci.tlcpack.ai/job/tvm/job/PR-11442/4/display/redirect" - } - ] - } - } - } - } - ] - }, - "reviewDecision": "APPROVED", - "reviews": { - "pageInfo": { - "hasPreviousPage": false - }, - "nodes": [ - { - "body": "", - "updatedAt": "2022-05-24T23:08:31Z", - "url": "https://github.com/apache/tvm/pull/11442#pullrequestreview-983954561", - "authorCanPushToRepository": true, - "commit": { - "oid": "23c600097cf1c2a55acda059626a060e106dd023" - }, - "author": { - "login": "tqchen" - }, - "state": "APPROVED" - } - ] - } -} \ No newline at end of file diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 0939aae10ab5..f2e686d1e582 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -23,15 +23,14 @@ import pytest import tvm.testing -from .test_utils import REPO_ROOT, TempGit +from .test_utils import REPO_ROOT, TempGit, run_script -def parameterize_named(*values): - keys = list(values[0].keys()) - if len(keys) == 1: - return pytest.mark.parametrize(",".join(keys), [d[keys[0]] for d in values]) - - return 
pytest.mark.parametrize(",".join(keys), [tuple(d.values()) for d in values]) +def parameterize_named(**kwargs): + keys = next(iter(kwargs.values())).keys() + return pytest.mark.parametrize( + ",".join(keys), [tuple(d.values()) for d in kwargs.values()], ids=kwargs.keys() + ) # pylint: disable=line-too-long @@ -137,23 +136,7 @@ def parameterize_named(*values): @tvm.testing.skip_if_wheel_test -@pytest.mark.parametrize( - [ - "main_xml_file", - "main_xml_content", - "pr_xml_file", - "pr_xml_content", - "target_url", - "s3_prefix", - "jenkins_prefix", - "common_main_build", - "commit_sha", - "expected_url", - "expected_body", - ], - [tuple(d.values()) for d in TEST_DATA_SKIPPED_BOT.values()], - ids=TEST_DATA_SKIPPED_BOT.keys(), -) +@parameterize_named(**TEST_DATA_SKIPPED_BOT) # pylint: enable=line-too-long def test_skipped_tests_comment( tmpdir_factory, @@ -182,49 +165,37 @@ def write_xml_file(root_dir, xml_file, xml_content): f.write(textwrap.dedent(xml_content)) git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") pr_test_report_dir = Path(git.cwd) / "pr-reports" write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content) main_test_report_dir = Path(git.cwd) / "main-reports" write_xml_file(main_test_report_dir, main_xml_file, main_xml_content) - proc = subprocess.run( + proc = run_script( [ - str(skipped_tests_script), + skipped_tests_script, "--dry-run", f"--s3-prefix={s3_prefix}", f"--jenkins-prefix={jenkins_prefix}", f"--common-main-build={common_main_build}", ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha}, - encoding="utf-8", cwd=git.cwd, - check=False, ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - assert f"Dry run, would have posted {expected_url} with data {expected_body}." 
in proc.stderr + assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr) @tvm.testing.skip_if_wheel_test -@pytest.mark.parametrize( - "target_url,base_url,commit_sha,expected_url,expected_body", - [ - ( - "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", - "https://pr-docs.tlcpack.ai", - "SHA", - "issues/11594/comments", - "\n\nBuilt docs for commit SHA can be found " - "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).", - ) - ], +@parameterize_named( + doc_link=dict( + target_url="https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", + base_url="https://pr-docs.tlcpack.ai", + commit_sha="SHA", + expected_url="issues/11594/comments", + expected_body="\n\nBuilt docs for commit SHA can be found " + "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).", + ) ) def test_docs_comment( tmpdir_factory, target_url, base_url, commit_sha, expected_url, expected_body @@ -235,146 +206,93 @@ def test_docs_comment( docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - proc = subprocess.run( - [str(docs_comment_script), "--dry-run", f"--base-url-docs={base_url}"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + proc = run_script( + [docs_comment_script, "--dry-run", f"--base-url-docs={base_url}"], env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha}, - encoding="utf-8", cwd=git.cwd, - check=False, ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - assert f"Dry run, would have posted {expected_url} with data {expected_body}." 
in proc.stderr + assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr) @tvm.testing.skip_if_wheel_test -def test_cc_reviewers(tmpdir_factory): - """ - Test that reviewers are added from 'cc @someone' messages in PRs - """ - reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py" - - def run(pr_body, requested_reviewers, existing_review_users, expected_reviewers): - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - reviews = [{"user": {"login": r}} for r in existing_review_users] - requested_reviewers = [{"login": r} for r in requested_reviewers] - proc = subprocess.run( - [str(reviewers_script), "--dry-run", "--testing-reviews-json", json.dumps(reviews)], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env={ - "PR": json.dumps( - {"number": 1, "body": pr_body, "requested_reviewers": requested_reviewers} - ) - }, - encoding="utf-8", - cwd=git.cwd, - check=False, - ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - - assert f"After filtering existing reviewers, adding: {expected_reviewers}" in proc.stdout - - run(pr_body="abc", requested_reviewers=[], existing_review_users=[], expected_reviewers=[]) - run( +@parameterize_named( + cc_no_one=dict( + pr_body="abc", requested_reviewers=[], existing_review_users=[], expected_reviewers=[] + ), + cc_abc=dict( pr_body="cc @abc", requested_reviewers=[], existing_review_users=[], expected_reviewers=["abc"], - ) - run(pr_body="cc @", requested_reviewers=[], existing_review_users=[], expected_reviewers=[]) - run( + ), + bad_cc_line=dict( + pr_body="cc @", requested_reviewers=[], existing_review_users=[], expected_reviewers=[] + ), + cc_multiple=dict( pr_body="cc @abc @def", requested_reviewers=[], existing_review_users=[], expected_reviewers=["abc", "def"], - ) - 
run( + ), + with_existing=dict( pr_body="some text cc @abc @def something else", requested_reviewers=[], existing_review_users=[], expected_reviewers=["abc", "def"], - ) - run( + ), + with_existing_split=dict( pr_body="some text cc @abc @def something else\n\n another cc @zzz z", requested_reviewers=[], existing_review_users=[], expected_reviewers=["abc", "def", "zzz"], - ) - run( + ), + with_existing_request=dict( pr_body="some text cc @abc @def something else\n\n another cc @zzz z", requested_reviewers=["abc"], existing_review_users=[], expected_reviewers=["def", "zzz"], - ) - run( + ), + with_existing_reviewers=dict( pr_body="some text cc @abc @def something else\n\n another cc @zzz z", requested_reviewers=["abc"], existing_review_users=["abc"], expected_reviewers=["def", "zzz"], - ) - run( + ), + with_no_reviewers=dict( pr_body="some text cc @abc @def something else\n\n another cc @zzz z", requested_reviewers=[], existing_review_users=["abc"], expected_reviewers=["def", "zzz"], - ) - - -def test_update_branch(tmpdir_factory): + ), +) +def test_cc_reviewers( + tmpdir_factory, pr_body, requested_reviewers, existing_review_users, expected_reviewers +): """ - Test that the last-successful branch script updates successfully + Test that reviewers are added from 'cc @someone' messages in PRs """ - update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py" - - def run(statuses, expected_rc, expected_output): - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - commit = { - "statusCheckRollup": {"contexts": {"nodes": statuses}}, - "oid": "123", - "messageHeadline": "hello", - } - data = { - "data": { - "repository": { - "defaultBranchRef": {"target": {"history": {"edges": [], "nodes": [commit]}}} - } - } - } - proc = subprocess.run( - [str(update_script), "--dry-run", "--testonly-json", json.dumps(data)], - stdout=subprocess.PIPE, - 
stderr=subprocess.PIPE, - encoding="utf-8", - cwd=git.cwd, - check=False, - ) + reviewers_script = REPO_ROOT / "ci" / "scripts" / "github_cc_reviewers.py" - if proc.returncode != expected_rc: - raise RuntimeError( - f"Wrong return code:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}" + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + reviews = [{"user": {"login": r}} for r in existing_review_users] + requested_reviewers = [{"login": r} for r in requested_reviewers] + proc = run_script( + [reviewers_script, "--dry-run", "--testing-reviews-json", json.dumps(reviews)], + env={ + "PR": json.dumps( + {"number": 1, "body": pr_body, "requested_reviewers": requested_reviewers} ) + }, + cwd=git.cwd, + ) - if expected_output not in proc.stdout: - raise RuntimeError( - f"Missing {expected_output}:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}" - ) + assert f"After filtering existing reviewers, adding: {expected_reviewers}" in proc.stdout + +@parameterize_named( # Missing expected tvm-ci/branch test - run( + missing_tvm_ci_branch=dict( statuses=[ { "context": "test", @@ -383,10 +301,9 @@ def run(statuses, expected_rc, expected_output): ], expected_rc=1, expected_output="No good commits found in the last 1 commits", - ) - + ), # Only has the right passing test - run( + has_expected_test=dict( statuses=[ { "context": "tvm-ci/branch", @@ -395,10 +312,9 @@ def run(statuses, expected_rc, expected_output): ], expected_rc=0, expected_output="Found last good commit: 123: hello", - ) - + ), # Check with many statuses - run( + many_statuses=dict( statuses=[ { "context": "tvm-ci/branch", @@ -415,8 +331,8 @@ def run(statuses, expected_rc, expected_output): ], expected_rc=1, expected_output="No good commits found in the last 1 commits", - ) - run( + ), + many_success_statuses=dict( statuses=[ { "context": "tvm-ci/branch", @@ -433,17 +349,50 @@ def run(statuses, expected_rc, expected_output): ], expected_rc=0, expected_output="Found last good commit: 123: hello", + ), +) +def 
test_update_branch(tmpdir_factory, statuses, expected_rc, expected_output): + """ + Test that the last-successful branch script updates successfully + """ + update_script = REPO_ROOT / "ci" / "scripts" / "update_branch.py" + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + commit = { + "statusCheckRollup": {"contexts": {"nodes": statuses}}, + "oid": "123", + "messageHeadline": "hello", + } + data = { + "data": { + "repository": { + "defaultBranchRef": {"target": {"history": {"edges": [], "nodes": [commit]}}} + } + } + } + proc = run_script( + [update_script, "--dry-run", "--testonly-json", json.dumps(data)], + cwd=git.cwd, + check=False, ) + if proc.returncode != expected_rc: + raise RuntimeError(f"Wrong return code:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") + + if expected_output not in proc.stdout: + raise RuntimeError( + f"Missing {expected_output}:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}" + ) + @parameterize_named( - dict( + dont_skip_main=dict( commands=[], should_skip=False, pr_title="[skip ci] test", why="ci should not be skipped on main", ), - dict( + dont_skip_main_with_commit=dict( commands=[ ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], ], @@ -451,7 +400,7 @@ def run(statuses, expected_rc, expected_output): pr_title="[skip ci] test", why="ci should not be skipped on main", ), - dict( + skip_on_new_branch=dict( commands=[ ["checkout", "-b", "some_new_branch"], ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], @@ -460,7 +409,7 @@ def run(statuses, expected_rc, expected_output): pr_title="[skip ci] test", why="ci should be skipped on a branch with [skip ci] in the last commit", ), - dict( + no_skip_in_pr_title=dict( commands=[ ["checkout", "-b", "some_new_branch"], ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], @@ -470,7 +419,7 @@ def run(statuses, expected_rc, expected_output): why="ci should not be skipped on a branch with " "[skip ci] in the last commit but not the PR title", ), 
- dict( + skip_in_pr_title=dict( commands=[ ["checkout", "-b", "some_new_branch"], ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], @@ -480,17 +429,7 @@ def run(statuses, expected_rc, expected_output): pr_title="[skip ci] test", why="ci should be skipped with [skip ci] in the PR title", ), - dict( - commands=[ - ["checkout", "-b", "some_new_branch"], - ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], - ["commit", "--allow-empty", "--message", "commit 2"], - ], - should_skip=True, - pr_title="[skip ci] test", - why="ci should be skipped with [skip ci] in the PR title", - ), - dict( + skip_in_pr_title_many_commits=dict( commands=[ ["checkout", "-b", "some_new_branch"], ["commit", "--allow-empty", "--message", "commit 1"], @@ -502,7 +441,7 @@ def run(statuses, expected_rc, expected_output): pr_title="[skip ci] test", why="ci should be skipped with [skip ci] in the PR title", ), - dict( + skip_anywhere_in_title=dict( commands=[ ["checkout", "-b", "some_new_branch"], ], @@ -518,22 +457,16 @@ def test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why): skip_ci_script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - # Jenkins git is too old and doesn't have 'git init --initial-branch' - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") + git.run("config", "user.name", "ci") git.run("config", "user.email", "email@example.com") git.run("commit", "--allow-empty", "--message", "base commit") for command in commands: git.run(*command) pr_number = "1234" - proc = subprocess.run( - [str(skip_ci_script), "--pr", pr_number, "--pr-title", pr_title], + proc = run_script( + [skip_ci_script, "--pr", pr_number, "--pr-title", pr_title], cwd=git.cwd, - stderr=subprocess.STDOUT, - stdout=subprocess.PIPE, - encoding="utf-8", check=False, ) expected = 0 if should_skip else 1 @@ -544,120 +477,66 @@ def 
test_skip_ci(tmpdir_factory, commands, should_skip, pr_title, why): ) -def test_skip_globs(tmpdir_factory): +@parameterize_named( + no_file=dict(files=[], should_skip=True), + readme=dict(files=["README.md"], should_skip=True), + c_file=dict(files=["test.c"], should_skip=False), + c_and_readme=dict(files=["test.c", "README.md"], should_skip=False), + src_file_and_readme=dict( + files=["src/autotvm/feature_visitor.cc", "README.md"], should_skip=False + ), + yaml_and_readme=dict(files=[".asf.yaml", "docs/README.md"], should_skip=True), +) +def test_skip_globs(tmpdir_factory, files, should_skip): """ Test that CI is skipped if only certain files are edited """ script = REPO_ROOT / "ci" / "scripts" / "git_skip_ci_globs.py" - def run(files, should_skip): - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - # Jenkins git is too old and doesn't have 'git init --initial-branch' - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - - proc = subprocess.run( - [ - str(script), - "--files", - ",".join(files), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="utf-8", - cwd=git.cwd, - check=False, - ) - - if should_skip: - assert proc.returncode == 0 - else: - assert proc.returncode == 1 - - run([], should_skip=True) - run(["README.md"], should_skip=True) - run(["test.c"], should_skip=False) - run(["test.c", "README.md"], should_skip=False) - run(["src/autotvm/feature_visitor.cc", "README.md"], should_skip=False) - run([".asf.yaml", "docs/README.md"], should_skip=True) + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + proc = run_script( + [ + script, + "--files", + ",".join(files), + ], + check=False, + cwd=git.cwd, + ) -def test_ping_reviewers(tmpdir_factory): - """ - Test that reviewers are messaged after a time period of inactivity - """ - reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py" + if should_skip: + assert proc.returncode == 0 + else: + assert 
proc.returncode == 1 - def run(pull_request, check): - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - # Jenkins git is too old and doesn't have 'git init --initial-branch' - git.run("init") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - - data = { - "data": { - "repository": { - "pullRequests": { - "nodes": [pull_request], - "edges": [], - } - } - } - } - proc = subprocess.run( - [ - str(reviewers_script), - "--dry-run", - "--wait-time-minutes", - "1", - "--cutoff-pr-number", - "5", - "--allowlist", - "user", - "--pr-json", - json.dumps(data), - "--now", - "2022-01-26T17:54:19Z", - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="utf-8", - cwd=git.cwd, - check=False, - ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - assert check in proc.stdout +def all_time_keys(time): + return { + "updatedAt": time, + "lastEditedAt": time, + "createdAt": time, + "publishedAt": time, + } - def all_time_keys(time): - return { - "updatedAt": time, - "lastEditedAt": time, - "createdAt": time, - "publishedAt": time, - } - run( - { +@parameterize_named( + draft=dict( + pull_request={ "isDraft": True, "number": 2, }, - "Checking 0 of 1 fetched", - ) - - run( - { + check="Checking 0 of 1 fetched", + ), + not_draft=dict( + pull_request={ "isDraft": False, "number": 2, }, - "Checking 0 of 1 fetched", - ) - - run( - { + check="Checking 0 of 1 fetched", + ), + week_old=dict( + pull_request={ "number": 123, "url": "https://github.com/apache/tvm/pull/123", "body": "cc @someone", @@ -667,12 +546,11 @@ def all_time_keys(time): **all_time_keys("2022-01-18T17:54:19Z"), "comments": {"nodes": []}, }, - "Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123", - ) - + check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123", + ), # Check allowlist functionality - run( - { + allowlist=dict( + 
pull_request={ "number": 123, "url": "https://github.com/apache/tvm/pull/123", "body": "cc @someone", @@ -686,12 +564,11 @@ def all_time_keys(time): ] }, }, - "Checking 0 of 1 fetched", - ) - + check="Checking 0 of 1 fetched", + ), # Old comment, ping - run( - { + old_comment=dict( + pull_request={ "number": 123, "url": "https://github.com/apache/tvm/pull/123", "body": "cc @someone", @@ -708,12 +585,11 @@ def all_time_keys(time): ] }, }, - "Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123", - ) - + check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123", + ), # New comment, don't ping - run( - { + new_comment=dict( + pull_request={ "number": 123, "url": "https://github.com/apache/tvm/pull/123", "body": "cc @someone", @@ -727,8 +603,45 @@ def all_time_keys(time): ] }, }, - "Not pinging PR 123", + check="Not pinging PR 123", + ), +) +def test_ping_reviewers(tmpdir_factory, pull_request, check): + """ + Test that reviewers are messaged after a time period of inactivity + """ + reviewers_script = REPO_ROOT / "ci" / "scripts" / "ping_reviewers.py" + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + + data = { + "data": { + "repository": { + "pullRequests": { + "nodes": [pull_request], + "edges": [], + } + } + } + } + proc = run_script( + [ + reviewers_script, + "--dry-run", + "--wait-time-minutes", + "1", + "--cutoff-pr-number", + "5", + "--allowlist", + "user", + "--pr-json", + json.dumps(data), + "--now", + "2022-01-26T17:54:19Z", + ], + cwd=git.cwd, ) + assert_in(check, proc.stdout) def assert_in(needle: str, haystack: str): @@ -740,69 +653,8 @@ def assert_in(needle: str, haystack: str): @tvm.testing.skip_if_wheel_test -def test_github_tag_teams(tmpdir_factory): - """ - Check that individuals are tagged from team headers - """ - tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py" - - def run(source_type, data, check): - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") - 
git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - - issue_body = """ - some text - [temporary] opt-in: @person5 - - - something: @person1 @person2 - - something3: @person1 @person2 @SOME1-ONE- - - something else @person1 @person2 - - something else2: @person1 @person2 - - something-else @person1 @person2 - """ - comment1 = """ - another thing: @person3 - another-thing @person3 - """ - comment2 = """ - something @person4 - @person5 - """ - teams = { - "data": { - "repository": { - "issue": { - "body": issue_body, - "comments": {"nodes": [{"body": comment1}, {"body": comment2}]}, - } - } - } - } - env = { - source_type: json.dumps(data), - } - proc = subprocess.run( - [ - str(tag_script), - "--dry-run", - "--team-issue-json", - json.dumps(teams), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="utf-8", - cwd=git.cwd, - env=env, - check=False, - ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - - assert_in(check, proc.stdout) - - run( +@parameterize_named( + no_cc=dict( source_type="ISSUE", data={ "title": "A title", @@ -818,9 +670,8 @@ def run(source_type, data, check): ), }, check="No one to cc, exiting", - ) - - run( + ), + no_additional_cc=dict( source_type="ISSUE", data={ "title": "A title", @@ -838,9 +689,8 @@ def run(source_type, data, check): ), }, check="No one to cc, exiting", - ) - - run( + ), + cc_update=dict( source_type="ISSUE", data={ "title": "A title", @@ -858,9 +708,8 @@ def run(source_type, data, check): }, check="would have updated issues/1234 with {'body': " "'\\nhello\\n\\nsomething\\n\\ncc @person1 @person2 @person4'}", - ) - - run( + ), + already_cced=dict( source_type="ISSUE", data={ "title": "A title", @@ -877,9 +726,8 @@ def run(source_type, data, check): ), }, check="No one to cc, exiting", - ) - - run( + ), + not_already_cced=dict( source_type="ISSUE", data={ "title": "[something] A title", 
@@ -897,9 +745,8 @@ def run(source_type, data, check): }, check="would have updated issues/1234 with {'body': " "'\\nhello\\n\\nsomething\\n\\ncc @person1 @person2 @person4'}", - ) - - run( + ), + no_new_ccs=dict( source_type="ISSUE", data={ "title": "[something] A title", @@ -916,9 +763,8 @@ def run(source_type, data, check): ), }, check="No one to cc, exiting", - ) - - run( + ), + mismatching_tags=dict( source_type="PR", data={ "title": "[something] A title", @@ -936,9 +782,8 @@ def run(source_type, data, check): ), }, check="No one to cc, exiting", - ) - - run( + ), + draft_pr=dict( source_type="PR", data={ "title": "[something] A title", @@ -956,9 +801,8 @@ def run(source_type, data, check): ), }, check="Terminating since 1234 is a draft", - ) - - run( + ), + edit_inplace=dict( source_type="ISSUE", data={ "title": "[something] A title", @@ -974,9 +818,8 @@ def run(source_type, data, check): check="would have updated issues/1234 with {'body': '`mold` and `lld` can be a much" " faster alternative to `ld` from gcc. We should modify our CMakeLists.txt to " "detect and use these when possible. 
cc @person1\\n\\ncc @person2 @person4'}", - ) - - run( + ), + edit_out_of_place=dict( source_type="ISSUE", data={ "title": "[something3] A title", @@ -989,9 +832,8 @@ def run(source_type, data, check): }, check="Dry run, would have updated issues/1234 with" " {'body': '@person2 @SOME1-ONE-\\n\\ncc @person1'}", - ) - - run( + ), + atted_but_not_cced=dict( source_type="ISSUE", data={ "title": "[] A title", @@ -1003,12 +845,64 @@ def run(source_type, data, check): "body": "@person2 @SOME1-ONE-", }, check="No one to cc, exiting", + ), +) +def test_github_tag_teams(tmpdir_factory, source_type, data, check): + """ + Check that individuals are tagged from team headers + """ + tag_script = REPO_ROOT / "ci" / "scripts" / "github_tag_teams.py" + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + + issue_body = """ + some text + [temporary] opt-in: @person5 + + - something: @person1 @person2 + - something3: @person1 @person2 @SOME1-ONE- + - something else @person1 @person2 + - something else2: @person1 @person2 + - something-else @person1 @person2 + """ + comment1 = """ + another thing: @person3 + another-thing @person3 + """ + comment2 = """ + something @person4 + @person5 + """ + teams = { + "data": { + "repository": { + "issue": { + "body": issue_body, + "comments": {"nodes": [{"body": comment1}, {"body": comment2}]}, + } + } + } + } + env = { + source_type: json.dumps(data), + } + proc = run_script( + [ + tag_script, + "--dry-run", + "--team-issue-json", + json.dumps(teams), + ], + cwd=git.cwd, + env=env, ) + assert_in(check, proc.stdout) + @tvm.testing.skip_if_wheel_test @parameterize_named( - dict( + same_tags=dict( tlcpackstaging_body={ "results": [ { @@ -1028,7 +922,7 @@ def run(source_type, data, check): expected="Tag names were the same, no update needed", expected_images=[], ), - dict( + staging_update=dict( tlcpackstaging_body={ "results": [ { @@ -1054,7 +948,7 @@ def run(source_type, data, check): "ci_arm = 'tlcpack/ci-arm:456-456-abc'", ], ), - dict( + 
tlcpack_update=dict( tlcpackstaging_body={ "results": [ { @@ -1084,22 +978,19 @@ def test_open_docker_update_pr( tag_script = REPO_ROOT / "ci" / "scripts" / "open_docker_update_pr.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") git.run("config", "user.name", "ci") git.run("config", "user.email", "email@example.com") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") images = [ - "ci_lint", - "ci_gpu", - "ci_cpu", - "ci_minimal", - "ci_wasm", - "ci_i386", - "ci_cortexm", "ci_arm", + "ci_cortexm", + "ci_cpu", + "ci_gpu", "ci_hexagon", + "ci_i386", + "ci_lint", + "ci_minimal", "ci_riscv", + "ci_wasm", ] docker_data = {} @@ -1107,52 +998,43 @@ def test_open_docker_update_pr( docker_data[f"repositories/tlcpackstaging/{image}/tags"] = tlcpackstaging_body docker_data[f"repositories/tlcpack/{image.replace('_', '-')}/tags"] = tlcpack_body - proc = subprocess.run( + proc = run_script( [ - str(tag_script), + tag_script, "--dry-run", "--testing-docker-data", json.dumps(docker_data), ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding="utf-8", cwd=git.cwd, env={"GITHUB_TOKEN": "1234"}, - check=False, + stderr=subprocess.STDOUT, ) for line in expected_images: if line not in proc.stdout: raise RuntimeError(f"Missing line {line} in output:\n{proc.stdout}") - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - assert_in(expected, proc.stdout) -@pytest.mark.parametrize( - "images,expected", - [ - ( - ["ci_arm=tlcpack/ci-arm:abc-abc-123", "ci_lint=tlcpack/ci-lint:abc-abc-234"], - { - "ci_arm": "tlcpack/ci-arm:abc-abc-123", - "ci_lint": "tlcpack/ci-lint:abc-abc-234", - }, - ), - ( - ["ci_arm2=tlcpack/ci-arm2:abc-abc-123"], - { - "ci_arm2": "tlcpackstaging/ci_arm2:abc-abc-123", - }, - ), - ], +@parameterize_named( + use_tlcpack=dict( + images=["ci_arm=tlcpack/ci-arm:abc-abc-123", "ci_lint=tlcpack/ci-lint:abc-abc-234"], + 
expected={ + "ci_arm": "tlcpack/ci-arm:abc-abc-123", + "ci_lint": "tlcpack/ci-lint:abc-abc-234", + }, + ), + use_staging=dict( + images=["ci_arm2=tlcpack/ci-arm2:abc-abc-123"], + expected={ + "ci_arm2": "tlcpackstaging/ci_arm2:abc-abc-123", + }, + ), ) def test_determine_docker_images(tmpdir_factory, images, expected): """Test script to decide whether to use tlcpack or tlcpackstaging for images""" - tag_script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py" + script = REPO_ROOT / "ci" / "scripts" / "determine_docker_images.py" git_dir = tmpdir_factory.mktemp("tmp_git_dir") @@ -1161,23 +1043,17 @@ def test_determine_docker_images(tmpdir_factory, images, expected): "repositories/tlcpack/ci-lint/tags/abc-abc-234": {}, } - proc = subprocess.run( + run_script( [ - str(tag_script), + script, "--testing-docker-data", json.dumps(docker_data), "--base-dir", git_dir, ] + images, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - encoding="utf-8", cwd=git_dir, - check=False, ) - if proc.returncode != 0: - raise RuntimeError(f"Failed to run script:\n{proc.stdout}") for expected_filename, expected_image in expected.items(): with open(Path(git_dir) / expected_filename) as f: @@ -1186,34 +1062,28 @@ def test_determine_docker_images(tmpdir_factory, images, expected): assert actual_image == expected_image -@pytest.mark.parametrize( - "changed_files,name,check,expected_code", - [ - d.values() - for d in [ - dict( - changed_files=[], - name="abc", - check="Image abc is not using new naming scheme", - expected_code=1, - ), - dict( - changed_files=[], name="123-123-abc", check="No extant hash found", expected_code=1 - ), - dict( - changed_files=[["test.txt"]], - name=None, - check="Did not find changes, no rebuild necessary", - expected_code=0, - ), - dict( - changed_files=[["test.txt"], ["docker/test.txt"]], - name=None, - check="Found docker changes", - expected_code=2, - ), - ] - ], +@parameterize_named( + invalid_name=dict( + changed_files=[], + name="abc", + 
check="Image abc is not using new naming scheme", + expected_code=1, + ), + no_hash=dict( + changed_files=[], name="123-123-abc", check="No extant hash found", expected_code=1 + ), + no_changes=dict( + changed_files=[["test.txt"]], + name=None, + check="Did not find changes, no rebuild necessary", + expected_code=0, + ), + docker_changes=dict( + changed_files=[["test.txt"], ["docker/test.txt"]], + name=None, + check="Found docker changes", + expected_code=2, + ), ) def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expected_code): """ @@ -1222,11 +1092,8 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec tag_script = REPO_ROOT / "ci" / "scripts" / "should_rebuild_docker.py" git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init") git.run("config", "user.name", "ci") git.run("config", "user.email", "email@example.com") - git.run("checkout", "-b", "main") - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") git_path = Path(git.cwd) for i, commits in enumerate(changed_files): @@ -1262,15 +1129,13 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec }, } - proc = subprocess.run( + proc = run_script( [ - str(tag_script), + tag_script, "--testing-docker-data", json.dumps(docker_data), ], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - encoding="utf-8", cwd=git.cwd, check=False, ) diff --git a/tests/python/ci/test_tvmbot.py b/tests/python/ci/test_tvmbot.py index 2c7a0eaec0d4..ceabd46a9b03 100644 --- a/tests/python/ci/test_tvmbot.py +++ b/tests/python/ci/test_tvmbot.py @@ -18,13 +18,12 @@ Test the @tvm-bot merge code """ -import subprocess import json from pathlib import Path +from typing import Dict, Any -import pytest import tvm -from .test_utils import REPO_ROOT, TempGit +from .test_utils import REPO_ROOT, TempGit, run_script SUCCESS_EXPECTED_OUTPUT = """ @@ -37,167 +36,244 @@ """.strip() -TEST_DATA = { - "successful-merge": { - "number": 
10786, - "filename": "pr10786-merges.json", - "expected": SUCCESS_EXPECTED_OUTPUT, - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "Everything is fine so this PR will merge", - }, - "no-request": { - "number": 10786, - "filename": "pr10786-nottriggered.json", - "expected": "Command 'do something else' did not match anything", - "comment": "@tvm-bot do something else", - "user": "abc", - "detail": "A PR for which the mergebot runs but no merge is requested", - }, - "bad-ci": { - "number": 10786, - "filename": "pr10786-badci.json", - "expected": "Cannot merge, these CI jobs are not successful on", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "A PR which failed CI and cannot merge", - }, - "old-review": { - "number": 10786, - "filename": "pr10786-oldreview.json", - "expected": "Cannot merge, did not find any approving reviews", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "A PR with passing CI and approving reviews on an old commit so it cannot merge", - }, - "missing-job": { - "number": 10786, - "filename": "pr10786-missing-job.json", - "expected": "Cannot merge, missing expected jobs", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "PR missing an expected CI job and cannot merge", - }, - "invalid-author": { - "number": 10786, - "filename": "pr10786-invalid-author.json", - "expected": "Failed auth check 'collaborators', quitting", - "comment": "@tvm-bot merge", - "user": "not-abc", - "detail": "Merge requester is not a committer and cannot merge", - }, - "unauthorized-comment": { - "number": 11244, - "filename": "pr11244-unauthorized-comment.json", - "expected": "Failed auth check 'collaborators'", - "comment": "@tvm-bot merge", - "user": "not-abc2", - "detail": "Check that a merge comment not from a CONTRIBUTOR is rejected", - }, - "no-review": { - "number": 11267, - "filename": "pr11267-no-review.json", - "expected": "Cannot merge, did not find any approving reviews from users with write access", - "comment": 
"@tvm-bot merge", - "user": "abc", - "detail": "Check that a merge request without any reviews is rejected", - }, - "changes-requested": { - "number": 10786, - "filename": "pr10786-changes-requested.json", - "expected": "Cannot merge, found [this review]", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "Check that a merge request with a 'Changes Requested' review is rejected", - }, - "co-authors": { - "number": 10786, - "filename": "pr10786-co-authors.json", - "expected": "Co-authored-by: Some One ", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "Check that a merge request with co-authors generates the correct commit message", - }, - "rerun-ci": { - "number": 11442, - "filename": "pr11442-rerun-ci.json", - "expected": "Rerunning ci with", - "comment": "@tvm-bot rerun", - "user": "abc", - "detail": "Start a new CI job", - }, - "ignore-jobs": { - "number": 10786, - "filename": "pr10786-ignore-jobs.json", - "expected": "Dry run, would have merged", - "comment": "@tvm-bot merge", - "user": "abc", - "detail": "Ignore GitHub Actions jobs that don't start with CI / ", - }, -} +class _TvmBotTest: + NUMBER = 10786 + + def preprocess_data(self, data: Dict[str, Any]): + """ + Used to pre-process PR data before running the test. Override as + necessary to edit data for specific test cases. 
+ """ + return data + + @tvm.testing.skip_if_wheel_test + def test(self, tmpdir_factory): + """ + Run the tvm-bot script using the data from preprocess_data + """ + mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py" + test_json_dir = Path(__file__).resolve().parent / "sample_prs" + with open(test_json_dir / f"pr{self.NUMBER}.json") as f: + test_data = json.load(f) + + # Update testing data with replacements / additions + test_data = self.preprocess_data(test_data) + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + + comment = { + "body": self.COMMENT, + "id": 123, + "user": { + "login": self.USER, + }, + } + allowed_users = [{"login": "abc"}, {"login": "other-abc"}] + + proc = run_script( + [ + mergebot_script, + "--pr", + self.NUMBER, + "--dry-run", + "--run-url", + "https://example.com", + "--testing-pr-json", + json.dumps(test_data), + "--testing-collaborators-json", + json.dumps(allowed_users), + "--testing-mentionable-users-json", + json.dumps(allowed_users), + "--trigger-comment-json", + json.dumps(comment), + ], + env={ + "TVM_BOT_JENKINS_TOKEN": "123", + "GH_ACTIONS_TOKEN": "123", + }, + cwd=git.cwd, + ) + + if self.EXPECTED not in proc.stderr: + raise RuntimeError(f"{proc.stderr}\ndid not contain\n{self.EXPECTED}") + + +class TestNoRequest(_TvmBotTest): + """ + A PR for which the mergebot runs but no merge is requested + """ + + COMMENT = "@tvm-bot do something else" + USER = "abc" + EXPECTED = "Command 'do something else' did not match anything" + + def preprocess_data(self, data: Dict[str, Any]): + data["reviews"]["nodes"][0]["body"] = "nothing" + return data + + +class TestSuccessfulMerge(_TvmBotTest): + """ + Everything is fine so this PR will merge + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = SUCCESS_EXPECTED_OUTPUT + + +class TestBadCI(_TvmBotTest): + """ + A PR which failed CI and cannot merge + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Cannot merge, these CI jobs are not successful 
on" + + def preprocess_data(self, data: Dict[str, Any]): + # Mark the Jenkins build as failed + contexts = data["commits"]["nodes"][0]["commit"]["statusCheckRollup"]["contexts"]["nodes"] + for context in contexts: + if "context" in context and context["context"] == "tvm-ci/pr-head": + context["state"] = "FAILED" + return data + + +class TestOldReview(_TvmBotTest): + """ + A PR with passing CI and approving reviews on an old commit so it cannot merge + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Cannot merge, did not find any approving reviews" + + def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]: + data["reviews"]["nodes"][0]["commit"]["oid"] = "abc12345" + return data + + +class TestMissingJob(_TvmBotTest): + """ + PR missing an expected CI job and cannot merge + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Cannot merge, missing expected jobs" + + def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]: + contexts = data["commits"]["nodes"][0]["commit"]["statusCheckRollup"]["contexts"]["nodes"] + for context in contexts: + if "context" in context and context["context"] == "tvm-ci/pr-head": + context["context"] = "something" + return data + + +class TestInvalidAuthor(_TvmBotTest): + """ + Merge requester is not a committer and cannot merge + """ + + COMMENT = "@tvm-bot merge" + USER = "not-abc" + EXPECTED = "Failed auth check 'collaborators', quitting" -@tvm.testing.skip_if_wheel_test -@pytest.mark.parametrize( - ["number", "filename", "expected", "comment", "user", "detail"], - [tuple(d.values()) for d in TEST_DATA.values()], - ids=TEST_DATA.keys(), -) -def test_tvmbot(tmpdir_factory, number, filename, expected, comment, user, detail): - """ - Test the mergebot test cases - """ - mergebot_script = REPO_ROOT / "ci" / "scripts" / "github_tvmbot.py" - test_json_dir = Path(__file__).resolve().parent / "sample_prs" - - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - git.run("init", 
stderr=subprocess.PIPE, stdout=subprocess.PIPE) - git.run("checkout", "-b", "main", stderr=subprocess.PIPE, stdout=subprocess.PIPE) - git.run("remote", "add", "origin", "https://github.com/apache/tvm.git") - with open(test_json_dir / filename) as f: - test_data = json.load(f) - - comment = { - "body": comment, - "id": 123, - "user": { - "login": user, - }, - } - allowed_users = [{"login": "abc"}] - - proc = subprocess.run( - [ - str(mergebot_script), - "--pr", - str(number), - "--dry-run", - "--run-url", - "https://example.com", - "--testing-pr-json", - json.dumps(test_data), - "--testing-collaborators-json", - json.dumps(allowed_users), - "--testing-mentionable-users-json", - json.dumps(allowed_users), - "--trigger-comment-json", - json.dumps(comment), - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - encoding="utf-8", - env={ - "TVM_BOT_JENKINS_TOKEN": "123", - "GH_ACTIONS_TOKEN": "123", - }, - cwd=git.cwd, - check=False, - ) - if proc.returncode != 0: - raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") - - if expected not in proc.stderr: - raise RuntimeError(f"{proc.stderr}\ndid not contain\n{expected}") +class TestUnauthorizedComment(_TvmBotTest): + """ + Check that a merge comment not from a CONTRIBUTOR is rejected + """ + + COMMENT = "@tvm-bot merge" + USER = "not-abc2" + EXPECTED = "Failed auth check 'collaborators'" + + +class TestNoReview(_TvmBotTest): + """ + Check that a merge request without any reviews is rejected + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Cannot merge, did not find any approving reviews from users with write access" + + def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]: + data["reviews"]["nodes"] = [] + return data + + +class TestChangesRequested(_TvmBotTest): + """ + Check that a merge request with a 'Changes Requested' review is rejected + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Cannot merge, found [this review]" + + def 
preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]: + data["reviews"]["nodes"][0]["state"] = "CHANGES_REQUESTED" + data["reviews"]["nodes"][0]["url"] = "http://example.com" + return data + + +class TestCoAuthors(_TvmBotTest): + """ + Check that a merge request with co-authors generates the correct commit message + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Co-authored-by: Some One " + + def preprocess_data(self, data: Dict[str, Any]) -> Dict[str, Any]: + data["authorCommits"]["nodes"][0]["commit"]["authors"]["nodes"].append( + {"name": "Some One", "email": "someone@email.com"} + ) + return data + + +class TestRerunCI(_TvmBotTest): + """ + Start a new CI job + """ + + COMMENT = "@tvm-bot rerun" + USER = "abc" + EXPECTED = "Rerunning ci with" + + +class TestRerunPermissions(_TvmBotTest): + """ + Start a new CI job as an unauthorized user + """ + + COMMENT = "@tvm-bot rerun" + USER = "someone" + EXPECTED = "Failed auth check 'metionable_users', quitting" + + +class TestRerunNonAuthor(_TvmBotTest): + """ + Start a new CI job as a mentionable user + """ + + COMMENT = "@tvm-bot rerun" + USER = "other-abc" + EXPECTED = "Passed auth check 'metionable_users', continuing" + + +class TestIgnoreJobs(_TvmBotTest): + """ + Ignore GitHub Actions jobs that don't start with CI / + """ + + COMMENT = "@tvm-bot merge" + USER = "abc" + EXPECTED = "Dry run, would have merged" if __name__ == "__main__": diff --git a/tests/python/ci/test_utils.py b/tests/python/ci/test_utils.py index 513601aa1b46..4a0f2710e74a 100644 --- a/tests/python/ci/test_utils.py +++ b/tests/python/ci/test_utils.py @@ -19,19 +19,28 @@ """ import subprocess import pathlib +from typing import List, Any REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent class TempGit: """ - A wrapper to run commands in a directory + A wrapper to run commands in a directory (specifically for use in CI tests) """ def __init__(self, cwd): self.cwd = cwd + # Jenkins git is too old and 
doesn't have 'git init --initial-branch', + # so init and checkout need to be separate steps + self.run("init", stderr=subprocess.PIPE, stdout=subprocess.PIPE) + self.run("checkout", "-b", "main", stderr=subprocess.PIPE) + self.run("remote", "add", "origin", "https://github.com/apache/tvm.git") def run(self, *args, **kwargs): + """ + Run a git command based on *args + """ proc = subprocess.run( ["git"] + list(args), encoding="utf-8", cwd=self.cwd, check=False, **kwargs ) @@ -39,3 +48,25 @@ def run(self, *args, **kwargs): raise RuntimeError(f"git command failed: '{args}'") return proc + + +def run_script(command: List[Any], check: bool = True, **kwargs): + """ + Wrapper to run a script and print its output if there was an error + """ + command = [str(c) for c in command] + kwargs_to_send = { + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "encoding": "utf-8", + } + kwargs_to_send.update(kwargs) + proc = subprocess.run( + command, + check=False, + **kwargs_to_send, + ) + if check and proc.returncode != 0: + raise RuntimeError(f"Process failed:\nstdout:\n{proc.stdout}\n\nstderr:\n{proc.stderr}") + + return proc From 0cbf3aa6e22e77a62256e35a9eef4dbe327b6fa0 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Fri, 2 Sep 2022 14:27:45 -0700 Subject: [PATCH 101/704] [AutoTVM][Testing] Add `tune_relay` scripts (#12685) Example: ```bash python -m tvm.autotvm.testing.tune_relay \ --workload bert_base \ --input-shape '[1,64]' \ --target "llvm" \ --num-trials 800 \ --rpc-host 192.168.6.66 \ --rpc-port 4445 \ --rpc-key 3090ti \ --work-dir /logs/autotvm-bert_base \ --cache-dir /cache-workloads \ --graph-tuner True \ --cpu-flush True \ --backend graph ``` --- python/tvm/autotvm/testing/__init__.py | 17 ++ python/tvm/autotvm/testing/tune_relay.py | 263 +++++++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 python/tvm/autotvm/testing/__init__.py create mode 100644 python/tvm/autotvm/testing/tune_relay.py diff --git 
a/python/tvm/autotvm/testing/__init__.py b/python/tvm/autotvm/testing/__init__.py new file mode 100644 index 000000000000..972d0cbaae5c --- /dev/null +++ b/python/tvm/autotvm/testing/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Testing utilities for autotvm""" diff --git a/python/tvm/autotvm/testing/tune_relay.py b/python/tvm/autotvm/testing/tune_relay.py new file mode 100644 index 000000000000..e4745963741f --- /dev/null +++ b/python/tvm/autotvm/testing/tune_relay.py @@ -0,0 +1,263 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +import argparse +import json +import os +import warnings +from distutils.util import strtobool + +import tvm +from tvm import autotvm +from tvm import meta_schedule as ms +from tvm import relay +from tvm.autotvm.graph_tuner import DPTuner +from tvm.autotvm.tuner import XGBTuner +from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc +from tvm.meta_schedule.testing.relay_workload import get_network +from tvm.meta_schedule.testing.tune_utils import create_timer, generate_input_data +from tvm.support import describe + + +def _parse_args(): + args = argparse.ArgumentParser() + args.add_argument( + "--workload", + type=str, + required=True, + help="The name of the workload to tune. Supported models: " + "https://github.com/apache/tvm/blob/main/python/tvm/meta_schedule/testing/relay_workload.py#L303-L322", # pylint: disable=line-too-long + ) + args.add_argument( + "--input-shape", + type=str, + required=True, + help="The input shape of the workload. Example: '[1, 3, 224, 224]'", + ) + args.add_argument( + "--target", + type=str, + required=True, + help="The target device to tune. " + "Example: 'aws/cpu/c5.9xlarge', 'nvidia/nvidia-v100', 'nvidia/geforce-rtx-3090'", + ) + args.add_argument( + "--num-trials", + type=int, + required=True, + help="The number of trials per kernel. Example: 800", + ) + args.add_argument( + "--rpc-host", + type=str, + required=True, + help="The host address of the RPC tracker. Example: 192.168.6.66", + ) + args.add_argument( + "--rpc-port", + type=int, + required=True, + help="The port of the RPC tracker. Example: 4445", + ) + args.add_argument( + "--rpc-key", + type=str, + required=True, + help="The key of the RPC tracker. Example: '3090ti'", + ) + args.add_argument( + "--work-dir", + type=str, + required=True, + help="The working directory to store the tuning logs. 
Example: '/tmp/tune_relay'", + ) + args.add_argument( + "--layout", + type=str, + default=None, + help="The layout of the workload. Example: 'NCHW', 'NHWC'", + ) + args.add_argument( + "--cache-dir", + type=str, + default=None, + ) + args.add_argument( + "--number", + type=int, + default=3, + ) + args.add_argument( + "--repeat", + type=int, + default=1, + ) + args.add_argument( + "--min-repeat-ms", + type=int, + default=100, + ) + args.add_argument( + "--cpu-flush", + type=lambda x: bool(strtobool(x)), + help="example: True / False", + required=True, + ) + args.add_argument( + "--graph-tuner", + type=lambda x: bool(strtobool(x)), + help="example: True / False", + required=True, + ) + args.add_argument( + "--backend", + type=str, + choices=["graph", "vm"], + help="example: graph / vm", + required=True, + ) + parsed = args.parse_args() + parsed.target = tvm.target.Target(parsed.target) + parsed.input_shape = json.loads(parsed.input_shape) + parsed.rpc_config = ms.runner.RPCConfig( + tracker_host=parsed.rpc_host, + tracker_port=parsed.rpc_port, + tracker_key=parsed.rpc_key, + session_timeout_sec=600, + ) + if parsed.target.kind.name != "llvm" and parsed.graph_tuner: + raise ValueError("GraphTuner only supports llvm target") + if parsed.target.kind.name != "llvm" and parsed.cpu_flush: + raise ValueError("cpu_flush only supports llvm target") + if parsed.target.kind.name == "llvm" and not parsed.cpu_flush: + warnings.warn("cpu_flush is not enabled for llvm target") + return parsed + + +ARGS = _parse_args() + + +def main(): + log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") + graph_opt_sch_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}_graph_opt.log") + measure_option = autotvm.measure_option( + builder=autotvm.LocalBuilder(), + runner=autotvm.RPCRunner( + key=ARGS.rpc_key, + host=ARGS.rpc_host, + port=ARGS.rpc_port, + number=ARGS.number, + repeat=ARGS.repeat, + min_repeat_ms=ARGS.min_repeat_ms, + enable_cpu_cache_flush=ARGS.cpu_flush, + ), + ) + describe() +
print(f"Workload: {ARGS.workload}") + mod, params, (input_name, input_shape, input_dtype) = get_network( + ARGS.workload, + ARGS.input_shape, + layout=ARGS.layout, + cache_dir=ARGS.cache_dir, + ) + input_info = [ + { + "name": input_name, + "shape": input_shape, + "dtype": input_dtype, + }, + ] + input_data = { + item["name"]: generate_input_data(item["shape"], item["dtype"]) for item in input_info + } + for item in input_info: + print(f" input_name : {item['name']}") + print(f" input_shape: {item['shape']}") + print(f" input_dtype: {item['dtype']}") + + with ms.Profiler() as profiler: + with ms.Profiler.timeit("TaskExtraction"): + # extract workloads from relay program + tasks = autotvm.task.extract_from_program( + mod["main"], + target=ARGS.target, + params=params, + ops=( + relay.op.get("nn.conv2d"), + relay.op.get("nn.conv3d"), + relay.op.get("nn.conv2d_transpose"), + relay.op.get("nn.dense"), + relay.op.get("nn.batch_matmul"), + ), + ) + for i, task in enumerate(tasks): + print(f"Task {i} {task.name}: {task}") + + with ms.Profiler.timeit("Tuning"): + if ARGS.num_trials > 0: + for i, task in enumerate(tasks): + prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) + tuner_obj = XGBTuner(task, loss_type="rank") + n_trial = min(len(task.config_space), ARGS.num_trials) + tuner_obj.tune( + n_trial=n_trial, + early_stopping=800, + measure_option=measure_option, + callbacks=[ + autotvm.callback.progress_bar(n_trial, prefix=prefix), + autotvm.callback.log_to_file(log_file), + ], + ) + if ARGS.graph_tuner: + executor = DPTuner( + graph=mod["main"], + input_shapes={input_name: input_shape}, + records=log_file, + target_ops=[ + relay.op.get("nn.conv2d"), + ], + target=ARGS.target, + ) + executor.benchmark_layout_transform(min_exec_num=1000) + executor.run() + executor.write_opt_sch2record_file(graph_opt_sch_file) + + relay_build = {"graph": relay.build, "vm": relay.vm.compile}[ARGS.backend] + with ms.Profiler.timeit("PostTuningCompilation"): + if ARGS.graph_tuner: + ctx = 
autotvm.apply_graph_best(graph_opt_sch_file) + else: + ctx = autotvm.apply_history_best(log_file) + with ctx: + print("compile...") + with tvm.transform.PassContext(opt_level=3): + lib = relay_build(mod, target=ARGS.target, params=params) + print("Tuning Time:") + print(profiler.table()) + + run_module_via_rpc( + rpc_config=ARGS.rpc_config, + lib=lib, + dev_type=ARGS.target.kind.name, + args=input_data, + continuation=create_timer(ARGS.backend), + backend=ARGS.backend, + ) + + +if __name__ == "__main__": + main() From 4ed6564f764eea10af360a3e4bfb904061f5cc32 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 2 Sep 2022 15:01:22 -0700 Subject: [PATCH 102/704] [ci] Add tests for PR linter (#12680) This adds some checks for the current usages of the PR linter and fixes the case where the script would error uncleanly when a PR body was `null`. --- ci/scripts/check_pr.py | 17 ++++++--------- tests/python/ci/test_ci.py | 43 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 11 deletions(-) mode change 100644 => 100755 ci/scripts/check_pr.py diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py old mode 100644 new mode 100755 index 45d502c6a72e..9af5ec5580a3 --- a/ci/scripts/check_pr.py +++ b/ci/scripts/check_pr.py @@ -18,6 +18,7 @@ import argparse import re import os +import json import textwrap from dataclasses import dataclass from typing import Any, List, Callable @@ -108,10 +109,7 @@ def run_checks(checks: List[Check], s: str, name: str) -> bool: parser.add_argument("--pr", required=True) parser.add_argument("--remote", default="origin", help="ssh remote to parse") parser.add_argument( - "--pr-body", help="(testing) PR body to use instead of fetching from GitHub" - ) - parser.add_argument( - "--pr-title", help="(testing) PR title to use instead of fetching from GitHub" + "--pr-data", help="(testing) PR data to use instead of fetching from GitHub" ) args = parser.parse_args() @@ -121,20 +119,17 
@@ def run_checks(checks: List[Check], s: str, name: str) -> bool: print(f"PR was not a number: {args.pr}") exit(0) - if args.pr_body: - body = args.pr_body - title = args.pr_title + if args.pr_data: + pr = json.loads(args.pr_data) else: remote = git(["config", "--get", f"remote.{args.remote}.url"]) user, repo = parse_remote(remote) github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) pr = github.get(f"pulls/{args.pr}") - body = pr["body"] - title = pr["title"] - body = body.strip() - title = title.strip() + body = "" if pr["body"] is None else pr["body"].strip() + title = "" if pr["title"] is None else pr["title"].strip() title_passed = run_checks(checks=title_checks, s=title, name="PR title") print("") diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index f2e686d1e582..79c72ce988c3 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -1144,5 +1144,48 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec assert proc.returncode == expected_code +@parameterize_named( + passing=dict( + title="[something] a change", + body="something", + expected="All checks passed", + expected_code=0, + ), + period=dict( + title="[something] a change.", + body="something", + expected="trailing_period: FAILED", + expected_code=1, + ), + empty_body=dict( + title="[something] a change", + body=None, + expected="non_empty: FAILED", + expected_code=1, + ), +) +def test_pr_linter(title, body, expected, expected_code): + """ + Test the PR linter + """ + tag_script = REPO_ROOT / "ci" / "scripts" / "check_pr.py" + pr_data = { + "title": title, + "body": body, + } + proc = run_script( + [ + tag_script, + "--pr", + 1234, + "--pr-data", + json.dumps(pr_data), + ], + check=False, + ) + assert proc.returncode == expected_code + assert_in(expected, proc.stdout) + + if __name__ == "__main__": tvm.testing.main() From 2734d044a24bdfcdab1fb473d07b93f4ed6b64eb Mon Sep 17 00:00:00 2001 From: Alexey Voronov 
Date: Sat, 3 Sep 2022 01:42:59 +0300 Subject: [PATCH 103/704] [Adreno] Define memory_info for global.texture* (#12647) There are now many warnings in the tuning process about undefined memory information when using textures. A definition is required as textures* are tagged. --- include/tvm/target/target_info.h | 6 +++--- python/tvm/topi/adreno/utils.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/tvm/target/target_info.h b/include/tvm/target/target_info.h index 1de15a5bd526..946161f905f3 100644 --- a/include/tvm/target/target_info.h +++ b/include/tvm/target/target_info.h @@ -37,11 +37,11 @@ namespace tvm { class MemoryInfoNode : public Object { public: /*! \brief The addressable unit */ - int unit_bits; + int64_t unit_bits; /*! \brief Maximum number of bits supported in the memory */ - int max_num_bits; + int64_t max_num_bits; /*! \brief maximum number of bits to be used in simd op */ - int max_simd_bits; + int64_t max_simd_bits; /*! * \brief head address of the buffer, if visible to CPU * This address can be None. 
diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py index 6ad5271744b2..de0505af03d4 100644 --- a/python/tvm/topi/adreno/utils.py +++ b/python/tvm/topi/adreno/utils.py @@ -20,6 +20,7 @@ import tvm import numpy from tvm import te +from tvm._ffi.registry import register_func from tvm.topi.utils import simplify from tvm.topi import nn from tvm.autotvm.task.space import SplitEntity @@ -571,6 +572,19 @@ def get_texture_storage(shape): return "global.texture-weight" +@register_func("tvm.info.mem.global.texture") +@register_func("tvm.info.mem.global.texture-nhwc") +@register_func("tvm.info.mem.global.texture-weight") +def mem_info_global_texture_variants(): + return tvm.ir.make_node( + "MemoryInfo", + unit_bits=16, + max_num_bits=16384 * 16384 * 4 * 32, + max_simd_bits=4 * 32, + head_address=None, + ) + + def infer_tile_size(data, layout): """Compute the tile size for Winograd algorithm From 28cad58fd06f6fd395390f5a33c81acda4c27d12 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 2 Sep 2022 16:43:31 -0700 Subject: [PATCH 104/704] [Web][Emscripten] Update EMCC C++ standard to C++17 (#12693) As a follow-up to https://github.com/apache/tvm/pull/12337, updating the EMCC flags from `-std=c++14` to `-std=c++17`. 
--- web/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/Makefile b/web/Makefile index 34a1b8172484..d6adc94170fc 100644 --- a/web/Makefile +++ b/web/Makefile @@ -26,7 +26,7 @@ all: dist/wasm/tvmjs_runtime.wasm dist/wasm/tvmjs_runtime.wasi.js EMCC = emcc -EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++14 -Wno-ignored-attributes --no-entry \ +EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++17 -Wno-ignored-attributes --no-entry \ -s ALLOW_MEMORY_GROWTH=1 -s STANDALONE_WASM=1 -s ERROR_ON_UNDEFINED_SYMBOLS=0 EMCC_LDFLAGS = --pre-js emcc/preload.js From 5dcf62288b1d998df74ac36e48fcfe2424a0def8 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Mon, 5 Sep 2022 09:27:03 +0100 Subject: [PATCH 105/704] [ETHOSN] Use pytest parameterization for integration tests (#12688) Using pytest parameterization helps identify the particular parameter combinations that are failing for a given test. Additionally, it can be useful when parallelizing the tests. This commit makes sure that "trials" have been replaced by parameterization as well as completing a general cleanup. 
--- .../python/contrib/test_ethosn/test_conv2d.py | 399 ++++++++++-------- .../test_ethosn/test_depth_to_space.py | 59 ++- .../test_ethosn/test_fullyconnected.py | 95 ++--- .../contrib/test_ethosn/test_pooling.py | 77 ++-- tests/python/contrib/test_ethosn/test_relu.py | 71 ++-- .../python/contrib/test_ethosn/test_resize.py | 42 +- .../contrib/test_ethosn/test_sigmoid.py | 82 ++-- .../python/contrib/test_ethosn/test_split.py | 59 ++- .../contrib/test_ethosn/test_topologies.py | 73 ++-- 9 files changed, 492 insertions(+), 465 deletions(-) diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index ffe66f0d2be2..4026f8267d72 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -18,11 +18,14 @@ """Arm(R) Ethos(TM)-N integration conv2d tests""" import math + import numpy as np import pytest + import tvm from tvm import relay from tvm.testing import requires_ethosn + from . 
import infrastructure as tei @@ -99,12 +102,12 @@ def _get_model( padding=p if pad in ("attr", "both") else (0, 0, 0, 0), out_dtype="int32", ) - b = tvm.nd.array( + bias_data = tvm.nd.array( np.random.randint( np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=(out_channels,), dtype="int32" ) ) - biasc = relay.const(b, "int32") + biasc = relay.const(bias_data, "int32") bias = relay.nn.bias_add(conv, biasc, axis=3) if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray): req_input_sc = [sc * input_sc for sc in kernel_sc.numpy()] @@ -118,209 +121,222 @@ def _get_model( relay.const(output_zp, "int32"), # output zero point out_dtype=dtype, ) - params = {"w": weights_array, "b": b} + params = {"w": weights_array, "b": bias_data} return req, params @requires_ethosn -@pytest.mark.parametrize("depthwise", [False, True]) -@pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_conv2d(dtype, depthwise): +@pytest.mark.parametrize( + "dtype,qnn_per_channel", [("uint8", False), ("int8", False), ("int8", True)] +) +@pytest.mark.parametrize("pad,stride", [("attr", (2, 2)), ("none", (2, 2)), ("op", (1, 1))]) +@pytest.mark.parametrize( + "shape,out_channels,kernel_size", + [ + [(1, 17, 20, 26), 4, (3, 1)], + [(1, 9, 20, 30), 7, (1, 5)], + [(1, 21, 21, 22), 8, (2, 2)], + ], +) +def test_conv2d( + dtype, + shape, + out_channels, + kernel_size, + pad, + stride, + qnn_per_channel, +): """Compare Conv2D output with TVM.""" - - trials = [ - [(1, 17, 20, 26), 4, 3, 1, "attr", (2, 2), (1, 1), False], - [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), False], - [(1, 30, 27, 30), 5, 5, 3, "none", (1, 1), (1, 1), dtype == "int8"], - [(1, 14, 28, 11), 6, 2, 2, "op", (2, 2), (1, 1), False], - [(1, 9, 20, 30), 7, 1, 5, "none", (1, 1), (1, 1), False], - [(1, 21, 21, 22), 8, 5, 1, "attr", (2, 2), (1, 1), False], - [(1, 21, 21, 22), 8, 5, 1, "attr", (2, 2), (1, 1), dtype == "int8"], - [(1, 21, 25, 29), 9, 2, 5, "op", (1, 1), (1, 1), False], - [(1, 21, 25, 29), 9, 2, 5, "op", (1, 1), (1, 
1), dtype == "int8"], - [(1, 31, 28, 15), 10, 1, 2, "attr", (2, 2), (1, 1), False], - [(1, 21, 21, 8), 11, 3, 3, "none", (1, 1), (1, 1), False], - [(1, 5, 11, 6), 12, 5, 2, "op", (2, 2), (1, 1), False], - [(1, 12, 7, 18), 13, 1, 3, "op", (1, 1), (1, 1), False], - [(1, 24, 6, 26), 14, 3, 5, "none", (2, 2), (1, 1), False], - [(1, 19, 24, 16), 15, 2, 1, "attr", (1, 1), (1, 1), False], - ] - np.random.seed(0) - for shape, out_channels, kernel_h, kernel_w, pad, stride, dilation, qnn_per_channel in trials: - if depthwise: - out_channels = shape[3] - groups = out_channels - kernel_w = kernel_h - weight_format = "HWOI" - stride = (1, 1) if kernel_w == 1 else (2, 2) - else: - groups = 1 - weight_format = "HWIO" - outputs = [] - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, - np.iinfo(dtype).max + 1, - size=shape, - dtype=dtype, - ) - ), - } - input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) - input_sc = np.random.random() * 2 - if qnn_per_channel: - kernel_sc = tvm.nd.array( - np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32) + dilation = (1, 1) + groups = 1 + weight_format = "HWIO" + + outputs = [] + inputs = { + "a": tvm.nd.array( + np.random.randint( + np.iinfo(dtype).min, + np.iinfo(dtype).max + 1, + size=shape, + dtype=dtype, ) - else: - kernel_sc = np.random.random() * 2 - kernel_zp = ( - 0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) - ) - output_zp, output_sc = tei.get_conv2d_qnn_params( - dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, kernel_w, shape[3] - ) - model, params = _get_model( - shape, - kernel_h, - kernel_w, - input_zp, - input_sc, - kernel_zp, - kernel_sc, - output_zp, - output_sc, - pad, - stride, - dilation, - groups, - dtype, - out_channels, - weight_format, + ), + } + input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + input_sc = np.random.random() * 2 + if qnn_per_channel: + kernel_sc = tvm.nd.array( + 
np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32) ) - for npu in [False, True]: - mod = tei.make_module(model, params) - outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + else: + kernel_sc = np.random.random() * 2 + kernel_zp = ( + 0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + ) + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_size[0], kernel_size[1], shape[3] + ) + model, params = _get_model( + shape, + kernel_size[0], + kernel_size[1], + input_zp, + input_sc, + kernel_zp, + kernel_sc, + output_zp, + output_sc, + pad, + stride, + dilation, + groups, + dtype, + out_channels, + weight_format, + ) + for npu in [False, True]: + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) - tei.verify(outputs, dtype, 1) + tei.verify(outputs, dtype, 1) @requires_ethosn -def test_conv2d_failure(): - """Check Conv2D error messages.""" +@pytest.mark.parametrize( + "dtype,qnn_per_channel", [("uint8", False), ("int8", False), ("int8", True)] +) +@pytest.mark.parametrize("pad,stride", [("attr", (2, 2)), ("none", (2, 2)), ("op", (1, 1))]) +@pytest.mark.parametrize( + "shape,kernel_size", + [ + [(1, 17, 20, 28), (3, 3)], + [(1, 9, 20, 30), (5, 5)], + [(1, 21, 21, 22), (2, 2)], + ], +) +def test_conv2d_depthwise( + dtype, + shape, + kernel_size, + pad, + stride, + qnn_per_channel, +): + """Compare Conv2D output with TVM.""" + np.random.seed(0) - trials = [ - ( - (1, 4, 4, 4), - 1, - 1, - 0, - 1024, - 0, - 1024, - 0, - 1, - "none", - (1, 1), - (1, 1), - 1, - "uint8", - 8, - "HWIO", - "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)", + dilation = (1, 1) + out_channels = shape[3] + groups = out_channels + weight_format = "HWOI" + + outputs = [] + inputs = { + "a": tvm.nd.array( + np.random.randint( + np.iinfo(dtype).min, + np.iinfo(dtype).max + 1, + 
size=shape, + dtype=dtype, + ) ), + } + input_zp = np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + input_sc = np.random.random() * 2 + if qnn_per_channel: + kernel_sc = tvm.nd.array( + np.random.uniform(low=0, high=2, size=(out_channels,)).astype(np.float32) + ) + else: + kernel_sc = np.random.random() * 2 + kernel_zp = ( + 0 if dtype == "int8" else np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max) + ) + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, kernel_zp, kernel_sc, kernel_size[0], kernel_size[1], shape[3] + ) + model, params = _get_model( + shape, + kernel_size[0], + kernel_size[1], + input_zp, + input_sc, + kernel_zp, + kernel_sc, + output_zp, + output_sc, + pad, + stride, + dilation, + groups, + dtype, + out_channels, + weight_format, + ) + for npu in [False, True]: + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + + tei.verify(outputs, dtype, 1) + + +@requires_ethosn +@pytest.mark.parametrize( + "shape,pad,stride,dilation,err_msg", + [ ( (1, 4, 4, 4), - 2, - 2, - 0, - 1, - 0, - 1, - 0, - 2, "both", (1, 1), (1, 1), - 1, - "uint8", - 8, - "HWIO", "both op and attr padding exist, must be either op/attr only or no padding", ), ( (1, 4, 4, 4), - 1, - 1, - 0, - 1, - 0, - 1, - 0, - 2, "none", (1, 1, 1), (1, 1), - 1, - "uint8", - 8, - "HWIO", "stride size=3, stride size must = 2", ), ( (1, 4, 4, 4), - 1, - 1, - 0, - 1, - 0, - 1, - 0, - 2, "none", (1, 1), (2, 1), - 1, - "uint8", - 8, - "HWIO", "dilation=[2, 1], dilation must = [1, 1]", ), ( (2, 4, 4, 4), - 1, - 1, - 0, - 1, - 0, - 1, - 0, - 2, "none", (1, 1), (1, 1), - 1, - "uint8", - 8, - "HWIO", "batch size=2, batch size must = 1", ), - ] - + ], +) +def test_conv2d_failure(shape, pad, stride, dilation, err_msg): + """Check Conv2D error messages.""" np.random.seed(0) - for ( + + kernel_size = (2, 2) + groups = 1 + dtype = "uint8" + out_channels = 8 + weight_format = "HWIO" + + model, _ = _get_model( 
shape, - kernel_h, - kernel_w, - input_zp, - input_sc, - kernel_zp, - kernel_sc, - output_zp, - output_sc, + kernel_size[0], + kernel_size[1], + 0, + 1, + 0, + 1, + 0, + 1, pad, stride, dilation, @@ -328,26 +344,43 @@ def test_conv2d_failure(): dtype, out_channels, weight_format, - err_msg, - ) in trials: - model, _ = _get_model( - shape, - kernel_h, - kernel_w, - input_zp, - input_sc, - kernel_zp, - kernel_sc, - output_zp, - output_sc, - pad, - stride, - dilation, - groups, - dtype, - out_channels, - weight_format, - ) - model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d") - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) + + +@requires_ethosn +def test_conv2d_out_of_range_scale(): + """Check Conv2D scale out of range error.""" + np.random.seed(0) + + input_sc = 1024 + kernel_sc = 1024 + output_sc = 1 + + model, _ = _get_model( + (1, 4, 4, 4), + 1, + 1, + 0, + input_sc, + 0, + kernel_sc, + 0, + output_sc, + "none", + (1, 1), + (1, 1), + 1, + "uint8", + 8, + "HWIO", + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d") + mod = tei.make_ethosn_partition(model) + + expected_err_msg = ( + "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)" + ) + tei.test_error(mod, {}, expected_err_msg) diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py index c071fe00f212..732932d8f324 100644 --- a/tests/python/contrib/test_ethosn/test_depth_to_space.py +++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py @@ -33,37 +33,35 @@ def _get_model(shape, block, dtype, layout): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_depth_to_space(dtype): - """Compare Depth To Space output with TVM.""" - - trials = [ +@pytest.mark.parametrize( + "shape", + [ 
(1, 16, 16, 16), (1, 64, 32, 16), - ] - + ], +) +def test_depth_to_space(dtype, shape): + """Compare Depth To Space output with TVM.""" np.random.seed(0) - for shape in trials: - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ) - } - outputs = [] - for npu in [False, True]: - model = _get_model(shape, 2, dtype, "NHWC") - mod = tei.make_module(model, {}) - outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) - tei.verify(outputs, dtype, 1) + inputs = { + "a": tvm.nd.array( + np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype) + ) + } + outputs = [] + for npu in [False, True]: + model = _get_model(shape, 2, dtype, "NHWC") + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + tei.verify(outputs, dtype, 1) -@requires_ethosn -def test_depth_to_space_failure(): - """Check Depth To Space error messages.""" - trials = [ +@requires_ethosn +@pytest.mark.parametrize( + "shape,block,dtype,layout,err_msg", + [ ((2, 16, 16, 16), 2, "uint8", "NHWC", "batch size=2, batch size must = 1"), ( (1, 16, 16, 16), @@ -74,9 +72,10 @@ def test_depth_to_space_failure(): ), ((1, 16, 16, 16), 4, "uint8", "NHWC", "Only block size of 2 is supported"), ((1, 16, 16, 16), 2, "uint8", "NCHW", "Input layer must be NHWC or NHWCB"), - ] - - for shape, block, dtype, layout, err_msg in trials: - model = _get_model(shape, block, dtype, layout) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + ], +) +def test_depth_to_space_failure(shape, block, dtype, layout, err_msg): + """Check Depth To Space error messages.""" + model = _get_model(shape, block, dtype, layout) + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_fullyconnected.py b/tests/python/contrib/test_ethosn/test_fullyconnected.py index d5510bb79d2c..d38b2528c7bb 100644 
--- a/tests/python/contrib/test_ethosn/test_fullyconnected.py +++ b/tests/python/contrib/test_ethosn/test_fullyconnected.py @@ -114,62 +114,63 @@ def test_fullyconnected(shape, out_channels, dtype, input_zp, input_sc, kernel_z @requires_ethosn -def test_fullyconnected_failure(): - """Check Fully Connected error messages.""" - - trials = [ - ( - (1, 64), - (1, 64), - 0, - 1024, - 0, - 1024, - 0, - 1, - "uint8", - "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)", - ), +@pytest.mark.parametrize( + "shape,weight_shape,err_msg", + [ ( (1, 1, 1, 64), (1, 64), - 0, - 1, - 0, - 1, - 0, - 1, - "uint8", "Weights tensor must have I dimension equal to the number" " of channels of the input tensor.;", ), - ((1024, 64), (1, 64), 0, 1, 0, 1, 0, 1, "uint8", "batch size=1024, batch size must = 1;"), - ] - + ((1024, 64), (1, 64), "batch size=1024, batch size must = 1;"), + ], +) +def test_fullyconnected_failure(shape, weight_shape, err_msg): + """Check Fully Connected error messages.""" np.random.seed(0) - for ( + + dtype = "uint8" + + model, _ = _get_model( shape, weight_shape, - input_zp, + 0, + 1, + 0, + 1, + 0, + 1, + dtype, + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_fc") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) + + +@requires_ethosn +def test_fullyconnected_scale_out_of_range(): + """Check Fully Connected out of range scale error message.""" + np.random.seed(0) + + input_sc = 1024 + kernel_sc = 1024 + output_sc = 1 + + model, _ = _get_model( + (1, 64), + (1, 64), + 0, input_sc, - kernel_zp, + 0, kernel_sc, - output_zp, + 0, output_sc, - dtype, - err_msg, - ) in trials: - model, _ = _get_model( - shape, - weight_shape, - input_zp, - input_sc, - kernel_zp, - kernel_sc, - output_zp, - output_sc, - dtype, - ) - model = tei.make_ethosn_composite(model, "ethos-n.qnn_fc") - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + "uint8", + ) + model = 
tei.make_ethosn_composite(model, "ethos-n.qnn_fc") + mod = tei.make_ethosn_partition(model) + expected_error_msg = ( + "Overall scale (of the input * weights / output) should be in the range (2^-32, 65536)" + ) + tei.test_error(mod, {}, expected_error_msg) diff --git a/tests/python/contrib/test_ethosn/test_pooling.py b/tests/python/contrib/test_ethosn/test_pooling.py index e1c7358f71a1..1e0487d76778 100644 --- a/tests/python/contrib/test_ethosn/test_pooling.py +++ b/tests/python/contrib/test_ethosn/test_pooling.py @@ -38,91 +38,88 @@ def _get_model(shape, typef, sizes, strides, pads, layout, dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_pooling(dtype): +@pytest.mark.parametrize( + "shape,typef,size,stride,pad", + [ + ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0)), + ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0)), + ((1, 8, 8, 8), relay.nn.avg_pool2d, (3, 3), (1, 1), (1, 1, 1, 1)), + ], +) +def test_pooling(dtype, shape, typef, size, stride, pad): """Compare Pooling output with TVM.""" + np.random.seed(0) - trials = [ - ((1, 8, 8, 8), relay.nn.max_pool2d, (2, 2), (2, 2), (0, 0, 0, 0), "NHWC"), - ((1, 9, 9, 9), relay.nn.max_pool2d, (3, 3), (2, 2), (0, 0, 0, 0), "NHWC"), - ((1, 8, 8, 8), relay.nn.avg_pool2d, (3, 3), (1, 1), (1, 1, 1, 1), "NHWC"), - ] + layout = "NHWC" - np.random.seed(0) - for shape, typef, size, stride, pad, layout in trials: - inputs = { - "a": tvm.nd.array( - np.random.randint( - low=np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ), - } - outputs = [] - model = _get_model(shape, typef, size, stride, pad, layout, dtype) - for npu in [False, True]: - mod = tei.make_module(model, {}) - outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + inputs = { + "a": tvm.nd.array( + np.random.randint( + low=np.iinfo(dtype).min, high=np.iinfo(dtype).max + 1, size=shape, dtype=dtype + ) + ), + } + outputs = [] + model = _get_model(shape, 
typef, size, stride, pad, layout, dtype) + for npu in [False, True]: + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) - tei.verify(outputs, dtype, 1) + tei.verify(outputs, dtype, 1) @requires_ethosn -def test_pooling_failure(): - """Check Pooling error messages.""" - - trials = [ +@pytest.mark.parametrize( + "shape,size,stride,layout,dtype,err_msg", + [ ( (2, 8, 8, 8), - relay.nn.max_pool2d, (2, 2), (2, 2), - (0, 0, 0, 0), "NHWC", "uint8", "batch size=2, batch size must = 1", ), ( (1, 8, 8, 8), - relay.nn.max_pool2d, (2, 2), (2, 2), - (0, 0, 0, 0), "NHWC", "int16", "dtype='int16', dtype must be either uint8, int8 or int32", ), ( (1, 8, 8, 8), - relay.nn.max_pool2d, (2, 2), (2, 2), - (0, 0, 0, 0), "NCHW", "uint8", "data format=NCHW, data format must = NHWC", ), ( (1, 8, 8, 8), - relay.nn.max_pool2d, (2, 2), (2, 2, 2), - (0, 0, 0, 0), "NHWC", "uint8", "stride size=3, stride size must = 2", ), ( (1, 8, 8, 8), - relay.nn.max_pool2d, (2, 2, 2), (2, 2), - (0, 0, 0, 0), "NHWC", "uint8", "dimensions=3, dimensions must = 2", ), - ] + ], +) +def test_pooling_failure(shape, size, stride, layout, dtype, err_msg): + """Check Pooling error messages.""" + + typef = relay.nn.max_pool2d + pad = (0, 0, 0, 0) - for shape, typef, size, stride, pad, layout, dtype, err_msg in trials: - model = _get_model(shape, typef, size, stride, pad, layout, dtype) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + model = _get_model(shape, typef, size, stride, pad, layout, dtype) + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py index f56a1cd7ad3c..db1894931dd9 100644 --- a/tests/python/contrib/test_ethosn/test_relu.py +++ b/tests/python/contrib/test_ethosn/test_relu.py @@ -33,53 +33,50 @@ def _get_model(shape, dtype, a_min, a_max): @requires_ethosn -@pytest.mark.parametrize("dtype", 
["uint8", "int8"]) -def test_relu(dtype): - """Compare Relu output with TVM.""" - - trials = [ +@pytest.mark.parametrize( + "shape,a_min,a_max,dtype", + [ ((1, 4, 4, 4), 65, 178, "uint8"), ((1, 8, 4, 2), 1, 254, "uint8"), - ((1, 16), 12, 76, "uint8"), - ((1, 4, 4, 4), 65, 125, "int8"), ((1, 8, 4, 2), -100, 100, "int8"), ((1, 16), -120, -20, "int8"), - ] - + ], +) +def test_relu(dtype, shape, a_min, a_max): + """Compare Relu output with TVM.""" np.random.seed(0) - for shape, a_min, a_max, trial_dtype in trials: - if trial_dtype == dtype: - inputs = { - "a": tvm.nd.array( - np.random.randint( - low=np.iinfo(dtype).min, - high=np.iinfo(dtype).max + 1, - size=shape, - dtype=dtype, - ) - ), - } - outputs = [] - for npu in [False, True]: - model = _get_model(inputs["a"].shape, dtype, a_min, a_max) - mod = tei.make_module(model, {}) - outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) - tei.verify(outputs, dtype, 1) + inputs = { + "a": tvm.nd.array( + np.random.randint( + low=np.iinfo(dtype).min, + high=np.iinfo(dtype).max + 1, + size=shape, + dtype=dtype, + ) + ), + } + outputs = [] + for npu in [False, True]: + model = _get_model(inputs["a"].shape, dtype, a_min, a_max) + mod = tei.make_module(model, {}) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + tei.verify(outputs, dtype, 1) -@requires_ethosn -def test_relu_failure(): - """Check Relu error messages.""" - trials = [ +@requires_ethosn +@pytest.mark.parametrize( + "shape,dtype,a_min,a_max,err_msg", + [ ((1, 4, 4, 4, 4), "uint8", 65, 78, "dimensions=5, dimensions must be <= 4"), ((1, 8, 4, 2), "int16", 1, 254, "dtype='int16', dtype must be either uint8, int8 or int32"), ((1, 8, 4, 2), "uint8", 254, 1, "Relu has lower bound > upper bound"), ((2, 2, 2, 2), "uint8", 1, 63, "batch size=2, batch size must = 1; "), - ] - - for shape, dtype, a_min, a_max, err_msg in trials: - model = _get_model(shape, dtype, a_min, a_max) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, 
err_msg) + ], +) +def test_relu_failure(shape, dtype, a_min, a_max, err_msg): + """Check Relu error messages.""" + model = _get_model(shape, dtype, a_min, a_max) + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py index 2cc641e63b5c..b437ad1e545c 100644 --- a/tests/python/contrib/test_ethosn/test_resize.py +++ b/tests/python/contrib/test_ethosn/test_resize.py @@ -97,10 +97,9 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met @requires_ethosn -def test_resize_failure(): - """Check Resize error messages.""" - - trials = [ +@pytest.mark.parametrize( + "size,err_msg", + [ ( (30, 20), "Requested height isn't supported", @@ -117,22 +116,25 @@ def test_resize_failure(): (20, 19), "Requested width and height must be both even or both odd", ), - ] + ], +) +def test_resize_failure(size, err_msg): + """Check Resize error messages.""" + dtype = "int8" zp_min = np.iinfo(dtype).min - for size, err_msg in trials: - model = _get_model( - shape=(1, 10, 10, 1), - dtype=dtype, - size=size, - input_zp=zp_min + 128, - input_sc=0.0784314, - output_zp=zp_min + 128, - output_sc=0.0784314, - coordinate_transformation_mode="half_pixel", - rounding_method="round_prefer_ceil", - ) - model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize") - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + model = _get_model( + shape=(1, 10, 10, 1), + dtype=dtype, + size=size, + input_zp=zp_min + 128, + input_sc=0.0784314, + output_zp=zp_min + 128, + output_sc=0.0784314, + coordinate_transformation_mode="half_pixel", + rounding_method="round_prefer_ceil", + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_resize") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_sigmoid.py b/tests/python/contrib/test_ethosn/test_sigmoid.py 
index ae8c301ff01a..bddd16049144 100644 --- a/tests/python/contrib/test_ethosn/test_sigmoid.py +++ b/tests/python/contrib/test_ethosn/test_sigmoid.py @@ -44,59 +44,59 @@ def _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype): @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_sigmoid(dtype): - """Compare Sigmoid output with TVM.""" - - trials = [ +@pytest.mark.parametrize( + "shape", + [ (1, 16, 16, 16), (1, 8, 8), - ] - + ], +) +def test_sigmoid(dtype, shape): + """Compare Sigmoid output with TVM.""" np.random.seed(0) - for shape in trials: - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ), - } - outputs = [] - for npu in [False, True]: - for _ in range(1, 2): - if dtype == "uint8": - input_zp = 0 - output_zp = 0 - else: - input_zp = 127 - output_zp = -128 - model = _get_model(shape, input_zp, 0.02, output_zp, 1.0 / 256.0, dtype) - mod = tei.make_module(model, []) - outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) - tei.verify(outputs, dtype, 1) + inputs = { + "a": tvm.nd.array( + np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype) + ), + } + outputs = [] + for npu in [False, True]: + for _ in range(1, 2): + if dtype == "uint8": + input_zp = 0 + output_zp = 0 + else: + input_zp = 127 + output_zp = -128 + model = _get_model(shape, input_zp, 0.02, output_zp, 1.0 / 256.0, dtype) + mod = tei.make_module(model, []) + outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu)) + tei.verify(outputs, dtype, 1) -@requires_ethosn -@pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_sigmoid_failure(dtype): - """Check Sigmoid error messages.""" - test_zp = 0 if dtype == "uint8" else -128 - trials = [ - ((2, 4, 4, 4), 64, 0.2, test_zp, 1 / 256, "batch size=2, batch size must = 1"), +@requires_ethosn +@pytest.mark.parametrize( + "shape,input_zp,input_sc,output_zp,output_sc,err_msg", + [ 
+ ((2, 4, 4, 4), 64, 0.2, 0, 1 / 256, "batch size=2, batch size must = 1"), ( (1, 4, 4, 4), 64, 0.2, 3, 1, - f"output quantization params=(3, 1), must = ({test_zp}, 1/256)", + "output quantization params=(3, 1), must = (0, 1/256)", ), - ] + ], +) +def test_sigmoid_failure(shape, input_zp, input_sc, output_zp, output_sc, err_msg): + """Check Sigmoid error messages.""" + + dtype = "uint8" - for shape, input_zp, input_sc, output_zp, output_sc, err_msg in trials: - model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype) - model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid") - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_sigmoid") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py index 7f8787afe947..afbc45a0805d 100644 --- a/tests/python/contrib/test_ethosn/test_split.py +++ b/tests/python/contrib/test_ethosn/test_split.py @@ -36,39 +36,37 @@ def _get_model(shape, dtype, splits, axis): @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_split(dtype): - """Compare Split output with TVM.""" - - trials = [ +@pytest.mark.parametrize( + "shape,splits,axis", + [ ((1, 16, 16, 32), (2, 7, 10), 2), ((1, 12, 8, 16), 3, 1), - ] - + ], +) +def test_split(dtype, shape, splits, axis): + """Compare Split output with TVM.""" np.random.seed(0) - for shape, splits, axis in trials: - outputs = [] - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ) - } - for npu in [False, True]: - model = _get_model(shape, dtype, splits, axis) - mod = tei.make_module(model, {}) - 
output_count = splits if isinstance(splits, int) else len(splits) + 1 - outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu)) + + outputs = [] + inputs = { + "a": tvm.nd.array( + np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype) + ) + } + for npu in [False, True]: + model = _get_model(shape, dtype, splits, axis) + mod = tei.make_module(model, {}) + output_count = splits if isinstance(splits, int) else len(splits) + 1 + outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu)) tei.verify(outputs, dtype, 0) @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn -def test_split_failure(): - """Check Split error messages.""" - - trials = [ +@pytest.mark.parametrize( + "shape,dtype,splits,axis,err_msg", + [ ((1, 4, 4, 4, 4), "uint8", 4, 2, "dimensions=5, dimensions must be <= 4;"), ((1, 4, 4, 4), "int16", 4, 2, "dtype='int16', dtype must be either uint8, int8 or int32;"), ((2, 4, 4, 4), "uint8", 4, 2, "batch size=2, batch size must = 1;"), @@ -81,9 +79,10 @@ def test_split_failure(): "Split along the channels dimension (axis 3) requires all output sizes " "(specified in splitInfo.m_Sizes) to be multiples of 16;", ), - ] - - for shape, dtype, splits, axis, err_msg in trials: - model = _get_model(shape, dtype, splits, axis) - mod = tei.make_ethosn_partition(model) - tei.test_error(mod, {}, err_msg) + ], +) +def test_split_failure(shape, dtype, splits, axis, err_msg): + """Check Split error messages.""" + model = _get_model(shape, dtype, splits, axis) + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py index 19d7accadb6d..dc6a2ed086d4 100644 --- a/tests/python/contrib/test_ethosn/test_topologies.py +++ b/tests/python/contrib/test_ethosn/test_topologies.py @@ -237,8 +237,15 @@ def get_model(): @requires_ethosn 
@pytest.mark.parametrize("dtype", ["uint8", "int8"]) -def test_split_with_asym_concats(dtype): +@pytest.mark.parametrize( + "shape,splits,axis", + [ + ((1, 16, 16, 32), (2, 7, 10), 2), + ], +) +def test_split_with_asym_concats(dtype, shape, splits, axis): """Test a model with split and contatenates.""" + np.random.seed(0) def get_model(shape, dtype, splits, axis): a = relay.var("a", shape=shape, dtype=dtype) @@ -263,51 +270,43 @@ def get_model(shape, dtype, splits, axis): ) return relay.Tuple((con2, con1)) - trials = [ - ((1, 16, 16, 32), (2, 7, 10), 2), - ] - - np.random.seed(0) - for shape, splits, axis in trials: - outputs = [] - inputs = { - "a": tvm.nd.array( - np.random.randint( - np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype - ) - ) - } - for npu in [False, True]: - model = get_model(shape, dtype, splits, axis) - mod = tei.make_module(model, {}) + outputs = [] + inputs = { + "a": tvm.nd.array( + np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype) + ) + } + for npu in [False, True]: + model = get_model(shape, dtype, splits, axis) + mod = tei.make_module(model, {}) - expected_host_ops = 1 - npu_partitions = 2 + expected_host_ops = 1 + npu_partitions = 2 - # Mock inference is only supported when the whole graph is offloaded to the NPU - if ethosn_available() == Available.SW_ONLY: - tei.build( + # Mock inference is only supported when the whole graph is offloaded to the NPU + if ethosn_available() == Available.SW_ONLY: + tei.build( + mod, + {}, + npu=npu, + expected_host_ops=expected_host_ops, + npu_partitions=npu_partitions, + ) + else: + outputs.append( + tei.build_and_run( mod, + inputs, + 2, {}, npu=npu, expected_host_ops=expected_host_ops, npu_partitions=npu_partitions, ) - else: - outputs.append( - tei.build_and_run( - mod, - inputs, - 2, - {}, - npu=npu, - expected_host_ops=expected_host_ops, - npu_partitions=npu_partitions, - ) - ) + ) - if outputs: - tei.verify(outputs, dtype, 0) + if outputs: 
+ tei.verify(outputs, dtype, 0) @pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") From b3edb6e227be0dea73413d5780d15a4cbdc3d83b Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 6 Sep 2022 14:12:14 +0100 Subject: [PATCH 106/704] [Apps] Pin android_camera TensorFlow/Keras dependency version (#12710) At the moment, android camera is installing latest TF and Keras which is causing the following issue in CI: ``` File ".../keras/dtensor/lazy_variable.py", line 26, in from tensorflow.python.trackable import base as trackable ModuleNotFoundError: No module named 'tensorflow.python.trackable' ``` This patch fixes the versions in the last known working versions of both: TF 2.9.1 and Keras 2.9. --- apps/android_camera/models/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/android_camera/models/requirements.txt b/apps/android_camera/models/requirements.txt index 98aa53def46f..1deff2b3548b 100644 --- a/apps/android_camera/models/requirements.txt +++ b/apps/android_camera/models/requirements.txt @@ -1,4 +1,4 @@ -keras +keras==2.9 mxnet scipy -tensorflow \ No newline at end of file +tensorflow==2.9.1 \ No newline at end of file From 832cffa1c1729c88c799e81c3340a80fb4a48baa Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Tue, 6 Sep 2022 11:06:03 -0400 Subject: [PATCH 107/704] [Hexagon][Runtime] Better support for 2-tier memory (#12574) - Introduce 'global.ddr' memory scope: - Like 'global', this allocates memory from the Hexagon SoC's DDR memory. - Like 'global.vtcm', the specified tensor shape must be 1d or 2d, where 2d indicates Hexagon's "indirect tensor" (i.e., discontiguous) allocation scheme. - Change memory-alignment strategy to always be 2048-byte aligned on Hexagon. (This can be refined in the future, but for now it ensures all allocations meet the strictest alignment requirements for any Hexagon operations.) 
--- src/runtime/hexagon/hexagon_buffer.cc | 17 ++-- src/runtime/hexagon/hexagon_device_api.cc | 47 +++++++--- .../contrib/test_hexagon/test_memory_alloc.py | 85 +++++++++++++++++++ 3 files changed, 126 insertions(+), 23 deletions(-) create mode 100644 tests/python/contrib/test_hexagon/test_memory_alloc.py diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index 0fc71d8ac29c..f23317fd01ed 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -161,17 +161,16 @@ void* HexagonBuffer::GetPointer() { HexagonBuffer::StorageScope HexagonBuffer::GetStorageScope() const { return storage_scope_; } void HexagonBuffer::SetStorageScope(Optional scope) { - if (!scope.defined()) { + const std::string s = scope.value_or("global"); + + if (s == "global") { + storage_scope_ = StorageScope::kDDR; + } else if (s == "global.ddr") { storage_scope_ = StorageScope::kDDR; + } else if (s == "global.vtcm") { + storage_scope_ = StorageScope::kVTCM; } else { - if (scope.value() == "global") { - storage_scope_ = StorageScope::kDDR; - } else if (scope.value() == "global.vtcm") { - storage_scope_ = StorageScope::kVTCM; - } else { - CHECK(false) << "Encountered unknown HexagonBuffer storage scope: " - << std::string(scope.value()); - } + CHECK(false) << "Encountered unknown HexagonBuffer storage scope: " << std::string(s); } } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index f22afca10bfa..cf384ae88db7 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -57,34 +57,53 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap CHECK(shape) << "shape array is null"; CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; + // IMPORTANT NOTE! + // Hexagon treats "global" memory scope VERY DIFFERENTLY from all the others. 
+ // + // With "global": + // - As with "global.ddr", this uses the target device's DDR memory. + // - The memory allocation must be a single, contiguous region of + // (virtual) memory addresses. + // - 'ndim' and 'shape' give the dimensions of the tensor to be stored + // in this allocation. There's no (practical) limit on the maximum + // rank (ndim) of the tensor. + // + // All other supported memory-scope names: + // - 'ndim' must be exactly 1 or 2: + // 1: A single, contiguous region of memory is requested. + // 2: A two-level memory allocation is required, suitable for storing a tensor + // in Hexagon's "indirect tensor" format: + // - shape[0] indicates the number of tensor-content memory allocations. + // - shape[1] indicates the size of each tensor-content memory allocation. if (!mem_scope.defined() || mem_scope.value() == "global") { return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope); } - // must be Hexagon device and VTCM scope after this point - CHECK_EQ(mem_scope.value(), "global.vtcm"); - CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) << "dev.device_type: " << dev.device_type; + // NOTE: This check should be superfluous, but it's probably a good idea to leave it in + // until the AoT executor's multi-device dispatch code is mature. 
--cconvey 2022-08-26 + CHECK(TVMDeviceExtType(dev.device_type) == kDLHexagon) + << "dev.device_type: " << dev.device_type << " DeviceName(" << dev.device_type + << "): " << DeviceName(dev.device_type) << ""; - size_t typesize = (dtype.bits / 8) * dtype.lanes; + CHECK(ndim >= 0 && ndim <= 2) + << "Hexagon Device API supports only 1d and 2d allocations, but received ndim = " << ndim; - size_t alignment = shape[ndim - 1] * typesize; - if (alignment < kHexagonAllocAlignment) { - alignment = kHexagonAllocAlignment; - } + const size_t typesize = (dtype.bits / 8) * dtype.lanes; if (ndim == 0) { - return hexbuffs.AllocateHexagonBuffer(typesize, alignment, mem_scope); + // Allocate storage for a single scalar value. + return hexbuffs.AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); } else if (ndim == 1) { + // Allocate a single, contiguous memory region. size_t nbytes = shape[0] * typesize; - return hexbuffs.AllocateHexagonBuffer(nbytes, alignment, mem_scope); + return hexbuffs.AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); } else if (ndim == 2) { + // Allocate the region(s) needed for Hexagon's indirect-tensor format. size_t nallocs = shape[0]; size_t nbytes = shape[1] * typesize; - return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, alignment, mem_scope); + return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope); } else { - LOG(FATAL) << "Hexagon Device API supports only 1d and 2d allocations, but received ndim = " - << ndim; - return nullptr; + return nullptr; // unreachable } } diff --git a/tests/python/contrib/test_hexagon/test_memory_alloc.py b/tests/python/contrib/test_hexagon/test_memory_alloc.py new file mode 100644 index 000000000000..fd948ea524f2 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_memory_alloc.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import os.path +import sys +import tempfile + +import numpy as np +import pytest + +import tvm +from tvm.script import tir as T + +from .infrastructure import allocate_hexagon_array + +_HEXAGON_TARGET = tvm.target.hexagon("v69", link_params=True) + + +@tvm.testing.fixture +def generated_func(shape, scope, dtype, axis_separators): + dim0, dim1 = shape + + @T.prim_func + def elwise(a: T.handle, b: T.handle): + A = T.match_buffer(a, shape, dtype=dtype, axis_separators=axis_separators) + B = T.match_buffer(b, shape, dtype=dtype, axis_separators=axis_separators) + + for i, j in T.grid(dim0, dim1): + with T.block("compute"): + B[i, j] = A[i, j] * T.cast(2, dtype=dtype) + + return elwise + + +class TestMemoryAlloc: + dtype = tvm.testing.parameter("int8") + shape = tvm.testing.parameter((128, 128)) + + (scope, axis_separators,) = tvm.testing.parameters( + ("global", []), + ("global.vtcm", []), + ("global.vtcm", [1]), + ("global.ddr", []), + ("global.ddr", [1]), + ) + + def test_global_axis_separator( + self, hexagon_session, generated_func, shape, dtype, scope, axis_separators + ): + mod1 = tvm.build( + generated_func, target=tvm.target.Target(_HEXAGON_TARGET, host=_HEXAGON_TARGET) + ) + mod2 = hexagon_session.load_module(mod1) + + a_np = np.ones(shape=shape, dtype=dtype) + a = 
allocate_hexagon_array( + hexagon_session.device, data=a_np, mem_scope=scope, axis_separators=axis_separators + ) + + b_np = np.zeros(shape=shape, dtype=dtype) + b = allocate_hexagon_array( + hexagon_session.device, data=b_np, mem_scope=scope, axis_separators=axis_separators + ) + + mod2(a, b) + tvm.testing.assert_allclose(a.numpy() * 2, b.numpy(), atol=1e-4, rtol=1e-4) + + +if __name__ == "__main__": + tvm.testing.main() From 744649e53bd32b53eb53020a111479facff3b88a Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 6 Sep 2022 10:31:39 -0700 Subject: [PATCH 108/704] [TIR][StorageRewrite] Allow in-place buffer reuse of non-flat memory (#12655) * [TIR][StorageRewrite] Allow in-place buffer reuse of non-flat memory Previously, shared buffer use was entirely disabled for non-flat memory, since the existing checks for shared memory assume flat 1-d spaces. This was enforced in `FindAlloc` and validated in `PrepareNewAlloc`. The validation in `PrepareNewAlloc` could trigger, if the buffer sharing was due to an in-place operation, and not through the `FindAlloc` function. In-place operations do not require N-d packing, nor do they introduce ambiguity in how different code generators may interpret non-flat physical indices. Therefore, this commit relaxes the validation in `PrepareNewAlloc`, allowing buffer reuse of non-flat buffers for in-place operations. 
* Update new StorageRewrite with correct allocate/buffer_decl usage --- src/tir/transforms/storage_rewrite.cc | 20 ++- .../test_tir_transform_storage_rewrite.py | 116 +++++++++++++++++- 2 files changed, 132 insertions(+), 4 deletions(-) diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 177017f9a245..67972ce67282 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -655,7 +655,25 @@ class StoragePlanRewriter : public StmtExprMutator { } } - if (e->allocs.size() == 1) { + bool all_allocs_identical = std::all_of( + e->allocs.begin() + 1, e->allocs.end(), [&](const AllocateNode* op) -> bool { + const AllocateNode* first = *e->allocs.begin(); + if (op->dtype != first->dtype) { + return false; + } + if (op->extents.size() != first->extents.size()) { + return false; + } + ExprDeepEqual expr_equal; + for (size_t i = 0; i < op->extents.size(); i++) { + if (!expr_equal(op->extents[i], first->extents[i])) { + return false; + } + } + return true; + }); + + if (all_allocs_identical) { // simply use the original allocation. 
e->new_alloc = Allocate(e->alloc_var, alloc_type, e->allocs[0]->extents, e->allocs[0]->condition, Evaluate(0)); diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py index 581afef88942..533a835e0f9c 100644 --- a/tests/python/unittest/test_tir_transform_storage_rewrite.py +++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py @@ -673,7 +673,11 @@ def func_rewritten(A: T.Buffer[(8,), "float32"]) -> None: tvm.ir.assert_structural_equal(mod["main"], func_rewritten) -class TestLetBufferRewrite(tvm.testing.CompareBeforeAfter): +class BaseCompare(tvm.testing.CompareBeforeAfter): + transform = tvm.tir.transform.StorageRewrite() + + +class TestLetBufferRewrite(BaseCompare): """StorageRewrite replaces the bound var of backing allocations If StorageRewrite replaces the backing variable of an array, such @@ -684,8 +688,6 @@ class TestLetBufferRewrite(tvm.testing.CompareBeforeAfter): handled. """ - transform = tvm.tir.transform.StorageRewrite() - def before() -> None: A_data: T.Ptr[T.int32] = T.call_extern("dummy_func", dtype="handle") A = T.buffer_decl([8], "int32", data=A_data) @@ -697,5 +699,113 @@ def expected() -> None: A[0] = T.broadcast(42, 8) +class TestRewriteInPlaceUseOfNonFlatBuffer(BaseCompare): + """A non-flat buffer may be re-used for in-place operations""" + + def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): + B_data = T.allocate( + [16, 16], + dtype="float32", + scope="global", + ) + B = T.buffer_decl( + [16, 16], + dtype="float32", + axis_separators=[1], + data=B_data, + ) + C_data = T.allocate( + [16, 16], + dtype="float32", + scope="global", + ) + C = T.buffer_decl( + [16, 16], + dtype="float32", + axis_separators=[1], + data=C_data, + ) + + for i, j in T.grid(16, 16): + B[i, j] = A[i, j] + + for i, j in T.grid(16, 16): + C[i, j] = 2.0 * B[i, j] + + for i, j in T.grid(16, 16): + D[i, j] = C[i, j] + + def expected(A: T.Buffer[(16, 16), 
"float32"], D: T.Buffer[(16, 16), "float32"]): + B_data = T.allocate( + [16, 16], + dtype="float32", + scope="global", + ) + B = T.buffer_decl([16, 16], dtype="float32", axis_separators=[1], data=B_data) + C = T.buffer_decl( + [16, 16], + dtype="float32", + axis_separators=[1], + data=B.data, + ) + + for i, j in T.grid(16, 16): + B[i, j] = A[i, j] + + for i, j in T.grid(16, 16): + C[i, j] = 2.0 * B[i, j] + + for i, j in T.grid(16, 16): + D[i, j] = C[i, j] + + +class TestNoRewriteOfSharedNonFlatBuffer(BaseCompare): + """In general, sharing of non-flat buffer isn't supported + + The current packing algorithms in StorageRewrite assume a flat + memory space, and do not support packing of N-d buffers. For + buffers with axis separators, normal buffer sharing should be + disabled. + + Like TestRewriteInPlaceUseOfNonFlatBuffer, except that B and C do + not have matching shapes. + """ + + def before(A: T.Buffer[(16, 16), "float32"], D: T.Buffer[(16, 16), "float32"]): + B_data = T.allocate( + [16, 16], + dtype="float32", + scope="global", + ) + B = T.buffer_decl( + [16, 16], + dtype="float32", + axis_separators=[1], + data=B_data, + ) + C_data = T.allocate( + [20, 20], + dtype="float32", + scope="global", + ) + C = T.buffer_decl( + [20, 20], + dtype="float32", + axis_separators=[1], + data=C_data, + ) + + for i, j in T.grid(16, 16): + B[i, j] = A[i, j] + + for i, j in T.grid(16, 16): + C[i, j] = 2.0 * B[i, j] + + for i, j in T.grid(16, 16): + D[i, j] = C[i, j] + + expected = before + + if __name__ == "__main__": tvm.testing.main() From d4201a9d8e56a391231cb71bf80d82ab36a9dfaf Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 6 Sep 2022 10:33:58 -0700 Subject: [PATCH 109/704] [COMMUNITY] ekalda -> Committer (#12715) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 01cf7058a069..2231fac66596 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -46,6 +46,7 @@ We do encourage everyone to work anything they are 
interested in. - [Chenfan Jia](https://github.com/jcf94): @jcf94 - auto_scheduler - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - ethos-u, memory planner +- [Elen Kalda](https://github.com/ekalda): @ekalda - ethos-u, arm - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime - [Ruihang Lai](https://github.com/MasterJH5574): @MasterJH5574 - tir, tvm-script From 141b17b23a801799576bab02b0654d062e071380 Mon Sep 17 00:00:00 2001 From: ibsidorenko <98739392+ibsidorenko@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:45:53 +0300 Subject: [PATCH 110/704] [Hexagon] Add optimized schedule for nn.pad (#12714) Motivation: In case of quantized models nn.pad operation typically is not fused with QNN ops and lives as a standalone operation. In this case it uses default injective schedule for Hexagon target and it is not optimized very well (based on analysis of real models like ResNet50 INT8). What was done: New schedule for Pad operation was implemented instead of default injective schedule. For Hexagon target injective schedule does fusion of all axis and vectorization on 128/64/32 (depends on dtype). It works fine for Add, Sub, etc... but not for Pad. New optimized schedule does these steps (fusion+vectorization) only if last tensor dimension is divisible by 128/64/32 (depends on dtype). It was done only for Hexagon, for other targets (x86, cuda, etc.) there is no changes and it uses default injective schedule. 
Benchmark results on Snapdragon 888: 4d NHWC layout with ((0, 0), (1, 1), (1, 1), (0, 0)) padding, "uint8" dtype: shape | default schedule, ms | optimized schedule, ms | speedup | -------------------|----------------------|------------------------|-------------------| (1, 112, 112, 32) | 10,03 | 0.2 | 50.1x times | (1, 56, 56, 128) | 0,099 | 0,085 | ~1x (no speedup) | ---------------------------------------------------------------------------------------| 4d NCHW layout with ((0, 0), (0, 0), (1, 1), (1, 1)) padding, "uint8" dtype: shape | default schedule, ms | optimized schedule, ms | speedup | -------------------|----------------------|------------------------|-------------------| (1, 128, 56, 56) | 10.96 | 1.38 | 7.9x times | (1, 32, 126, 126) | 1.66 | 1.58 | ~1x (no speedup) | (1, 32, 128, 128) | 13.98 | 2.66 | 5.25x times | ---------------------------------------------------------------------------------------| 5d NCHWc layout with ((0, 0), (0, 0), (1, 1), (1, 1), (0, 0)) padding, "uint8" dtype: shape | default schedule, ms | optimized schedule, ms | speedup | -------------------|----------------------|------------------------|-------------------| (1, 4, 56, 56, 32) | 6.39 | 0.29 | 22x times | (1, 56, 56, 128) | 0.15 | 0.15 | ~1x (no speedup) | ---------------------------------------------------------------------------------------| Summary: For some input tensors we get up to 50x times speedup, for other performance is the same. No performance degradations were detected. 
--- python/tvm/relay/op/nn/_nn.py | 2 +- python/tvm/relay/op/strategy/generic.py | 8 +++ python/tvm/relay/op/strategy/hexagon.py | 7 +++ python/tvm/topi/hexagon/__init__.py | 1 + python/tvm/topi/hexagon/pad.py | 51 +++++++++++++++++ .../contrib/test_hexagon/topi/test_pad.py | 57 +++++++++++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 python/tvm/topi/hexagon/pad.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_pad.py diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index ff213f098319..90a94c422992 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -701,7 +701,7 @@ def compute_upsampling3d(attrs, inputs, out_dtype): # pad -reg.register_broadcast_schedule("nn.pad") +reg.register_schedule("nn.pad", strategy.schedule_pad) # mirror_pad diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index 74abd9281f87..6ab281abeb37 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -205,6 +205,14 @@ def schedule_lrn(attrs, outs, target): return topi.generic.schedule_lrn(outs) +# pad +@generic_func +def schedule_pad(attrs, outs, target): + """Schedule PAD op""" + with target: + return schedule_injective(attrs, outs, target) + + # bitpack @generic_func def schedule_bitpack(attrs, outs, target): diff --git a/python/tvm/relay/op/strategy/hexagon.py b/python/tvm/relay/op/strategy/hexagon.py index be01ee50fba8..13c808f96b95 100644 --- a/python/tvm/relay/op/strategy/hexagon.py +++ b/python/tvm/relay/op/strategy/hexagon.py @@ -168,6 +168,13 @@ def schedule_concatenate_hexagon(attrs, outs, target): return topi.hexagon.schedule_injective(outs) +@schedule_pad.register("hexagon") +def schedule_pad_hexagon(attrs, outs, target): + """Schedule pad ops for Hexagon""" + with target: + return topi.hexagon.schedule_pad(outs) + + @schedule_pool.register("hexagon") def schedule_pool_hexagon(attrs, outs, 
target): """Schedule pool ops for Hexagon""" diff --git a/python/tvm/topi/hexagon/__init__.py b/python/tvm/topi/hexagon/__init__.py index a3768a6e809e..295152d11631 100644 --- a/python/tvm/topi/hexagon/__init__.py +++ b/python/tvm/topi/hexagon/__init__.py @@ -23,6 +23,7 @@ from .conv2d import * from .dense import * from .injective import * +from .pad import * from .pooling import * from .reduce import * from .resize2d import * diff --git a/python/tvm/topi/hexagon/pad.py b/python/tvm/topi/hexagon/pad.py new file mode 100644 index 000000000000..c744d47fefa1 --- /dev/null +++ b/python/tvm/topi/hexagon/pad.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Schedule for nn.pad operator""" + +import tvm + +import numpy as np + + +def schedule_pad(outs): + """Schedule for pad op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of injective in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + outs = [outs] if isinstance(outs, tvm.te.tensor.Tensor) else outs + s = tvm.te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) + + # Fuse axes and vectorize only if last output tensor dimension is divisible by a factor: + factor = 128 // np.dtype(outs[0].dtype).itemsize + last_dim = outs[0].shape[-1] + if last_dim % factor == 0 and last_dim // factor >= 0: + fused = s[outs[0]].fuse(*outs[0].op.axis) + _, inner = s[outs[0]].split(fused, factor=factor) + s[outs[0]].vectorize(inner) + + return s diff --git a/tests/python/contrib/test_hexagon/topi/test_pad.py b/tests/python/contrib/test_hexagon/topi/test_pad.py new file mode 100644 index 000000000000..631cb979dcbd --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_pad.py @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test code for reduce""" +import numpy as np + +import tvm +from tvm import te, topi +from tvm.contrib.hexagon.session import Session +from tvm.topi.utils import get_const_tuple + + +@tvm.testing.requires_hexagon +def test_nn_pad(hexagon_session: Session): + dtype = "uint8" + in_shape = (1, 56, 56, 32) + + data_in = np.ones(in_shape).astype(dtype) + + A = te.placeholder(shape=in_shape, name="A", dtype=dtype) + + C = topi.nn.pad(A, [0, 1, 1, 0], [0, 1, 1, 0], pad_value=0) + + target_hexagon = tvm.target.hexagon("v68") + with tvm.target.Target(target_hexagon): + fschedule = topi.hexagon.schedule_pad + s = fschedule(C) + + func = tvm.build(s, [A, C], tvm.target.Target(target_hexagon, host=target_hexagon), name="pad") + mod = hexagon_session.load_module(func) + + dev = hexagon_session.device + a = tvm.nd.array(data_in, dev) + b = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) + mod["pad"](a, b) + + # Reference numpy pad output + ref_out = np.pad(data_in, pad_width=((0, 0), (1, 1), (1, 1), (0, 0))) + + tvm.testing.assert_allclose(b.numpy(), ref_out) + + +if __name__ == "__main__": + tvm.testing.main() From da48e13b66fa053578815343c3f247f47364d0bb Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Tue, 6 Sep 2022 21:10:36 +0100 Subject: [PATCH 111/704] [TVMC] Run module once by default (#12713) * [TVMC] Run module once by default Currently executing `tvmc run module.tar` will run the input model twice. For benchmaking this is to be expected as the first run is used to prime caches etc before taking a measurement. However, this seems a bit unintuitive to have as default, especially when benchmarking is not always intended. In this sense, this commit aims to amend the number of runs for the default: `tvmc run module.tar` to a single run. After inspection, this seems to be down to the use of the `.benchmark()` method which runs (1 + repeat * number) executions in total. This means that at least two runs are required (i.e. when repeat=1, number=1). 
It also seems that it is only necessary to benchmark the model when `--print-time` has been set from the CLI POV. From the python interface POV, benchmarking is always run, but this may not always be necessary. This commit makes use of the `.run()` method to singularly execute the model by default. From the CLI this will be used when `--print-time` is set to False whereas from the python interface this will be used when `benchmark=False`. Otherwise, the `.benchmark()` method will be used as before. Complementary to this change `repeat`, `number` and `end_to_end` parameters are only used when either `--print-time` or `benchmark` are set to True - and the documentation has been updated to indicate this. Change-Id: I18a38a9d430d660264f7fce5caf0779aa059fed3 * improve documentation with number of exectuions when benchmarking Change-Id: Iecf557594420fcc9f3abcec5ce7d952db2c94271 --- python/tvm/driver/tvmc/runner.py | 58 +++++++++++++++++-------- tests/python/driver/tvmc/conftest.py | 16 +++++++ tests/python/driver/tvmc/test_model.py | 4 +- tests/python/driver/tvmc/test_runner.py | 42 ++++++++++++++++++ 4 files changed, 101 insertions(+), 19 deletions(-) diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index afb198ce1c6e..216f3bb2653b 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -92,7 +92,8 @@ def add_run_parser(subparsers, main_parser, json_params): parser.add_argument( "--print-time", action="store_true", - help="record and print the execution time(s). (non-micro devices only)", + help="record and print the execution time(s). Enabling print-time will result " + " in (1 + repeat * number) executions of the model. (non-micro devices only)", ) parser.add_argument( "--print-top", @@ -112,13 +113,24 @@ def add_run_parser(subparsers, main_parser, json_params): "--end-to-end", action="store_true", help="Measure data transfers as well as model execution. 
This can provide a " - "more realistic performance measurement in many cases.", + "more realistic performance measurement in many cases. Requires " + "'--print-time' to be specified.", ) parser.add_argument( - "--repeat", metavar="N", type=int, default=1, help="run the model n times. Defaults to '1'" + "--repeat", + metavar="N", + type=int, + default=1, + help="How many times to repeat the run. Requires '--print-time' to be " + "specified. Defaults to '1'", ) parser.add_argument( - "--number", metavar="N", type=int, default=1, help="repeat the run n times. Defaults to '1'" + "--number", + metavar="N", + type=int, + default=1, + help="The number of runs to measure within each repeat. Requires " + "'--print-time' to be specified. Defaults to '1'", ) parser.add_argument( "--rpc-key", @@ -273,6 +285,7 @@ def drive_run(args): rpc_key=args.rpc_key, inputs=inputs, fill_mode=args.fill_mode, + benchmark=args.print_time, repeat=args.repeat, number=args.number, profile=args.profile, @@ -462,6 +475,7 @@ def run_module( rpc_key: Optional[str] = None, inputs: Optional[Dict[str, np.ndarray]] = None, fill_mode: str = "random", + benchmark: bool = False, repeat: int = 10, number: int = 10, profile: bool = False, @@ -495,23 +509,26 @@ def run_module( The fill-mode to use when generating data for input tensors. Valid options are "zeros", "ones" and "random". Defaults to "random". + benchmark : bool, optional + Whether to benchmark the execution of the module. Enabling benchmark will + result in (1 + repeat * number) executions of the model. repeat : int, optional - How many times to repeat the run. + How many times to repeat the run. Requires `benchmark` to be set to True. number : int, optional The number of runs to measure within each repeat. + Requires `benchmark` to be set to True. profile : bool Whether to profile the run with the debug executor. end_to_end : bool Whether to measure the time of memory copies as well as model execution. 
Turning this on can provide a more realistic estimate of how long running the model in production would take. + Requires `benchmark` to be set to True. Returns ------- - outputs : dict - a dictionary with output tensors, generated by the module - times : list of str - execution times generated by the time evaluator + TVMCResult + The results of the run, including the output data. """ if not isinstance(tvmc_package, TVMCPackage): raise TVMCException( @@ -605,14 +622,19 @@ def run_module( exe = vm.VirtualMachine(lib, dev) exe_outputs = exe.invoke("main", **input_tensor) - times = exe.benchmark( - dev, - **input_tensor, - func_name="main", - repeat=repeat, - number=number, - end_to_end=end_to_end, - ) + + if benchmark: + times = exe.benchmark( + dev, + **input_tensor, + func_name="main", + repeat=repeat, + number=number, + end_to_end=end_to_end, + ) + else: + exe.run(**input_tensor) + times = [] # Special handling if the output only has a single value if not isinstance(exe_outputs, list): @@ -662,7 +684,7 @@ def run_module( # This print is intentional print(report) - if device == "micro": + if not benchmark or device == "micro": # TODO(gromero): Fix time_evaluator() for micro targets. Once it's # fixed module.benchmark() can be used instead and this if/else can # be removed. 
diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py index 48b465e507ae..8009448bff77 100644 --- a/tests/python/driver/tvmc/conftest.py +++ b/tests/python/driver/tvmc/conftest.py @@ -192,6 +192,22 @@ def model_compiler(model_file, **overrides): return model_compiler +@pytest.fixture +def relay_compile_model(tmpdir_factory): + """Support function that returns a TFLite compiled module""" + + def model_compiler(model_file, shape_dict, **overrides): + package_path = tmpdir_factory.mktemp("data").join("mock.tar") + tvmc_model = tvmc.frontends.load_model( + model_file, model_format="relay", shape_dict=shape_dict + ) + args = {"target": "llvm", **overrides} + return tvmc.compiler.compile_model(tvmc_model, package_path=package_path, **args) + + # Returns a TVMCPackage + return model_compiler + + @pytest.fixture(scope="session") def imagenet_cat(tmpdir_factory): tmpdir_name = tmpdir_factory.mktemp("data") diff --git a/tests/python/driver/tvmc/test_model.py b/tests/python/driver/tvmc/test_model.py index fb1f718c1bed..4d937212e9cc 100644 --- a/tests/python/driver/tvmc/test_model.py +++ b/tests/python/driver/tvmc/test_model.py @@ -45,7 +45,9 @@ def test_tvmc_workflow(use_vm, keras_simple): ) input_dict = {"input_1": np.random.uniform(size=(1, 32, 32, 3)).astype("float32")} - result = tvmc.run(tvmc_package, device="cpu", end_to_end=True, inputs=input_dict) + result = tvmc.run( + tvmc_package, device="cpu", end_to_end=True, benchmark=True, inputs=input_dict + ) assert type(tvmc_model) is TVMCModel assert type(tvmc_package) is TVMCPackage assert type(result) is TVMCResult diff --git a/tests/python/driver/tvmc/test_runner.py b/tests/python/driver/tvmc/test_runner.py index f0d363dc59ac..5e6386614b1c 100644 --- a/tests/python/driver/tvmc/test_runner.py +++ b/tests/python/driver/tvmc/test_runner.py @@ -87,6 +87,7 @@ def test_run_tflite_module__with_profile__valid_input( result = tvmc.run( tflite_compiled_model, inputs=input_dict, + benchmark=True, 
hostname=None, device="cpu", profile=True, @@ -145,3 +146,44 @@ def test_run_tflite_module_with_rpc( ), "tiger cat is expected in the top-5 for mobilenet v1" assert isinstance(result.outputs, dict) assert "output_0" in result.outputs.keys() + + +@pytest.mark.parametrize("use_vm", [True, False]) +@pytest.mark.parametrize( + "benchmark,repeat,number,expected_len", [(False, 1, 1, 0), (True, 1, 1, 1), (True, 3, 2, 3)] +) +def test_run_relay_module__benchmarking( + use_vm, + benchmark, + repeat, + number, + expected_len, + relay_text_conv2d, + relay_compile_model, +): + """Check the length of the results from benchmarking is what is expected by expected_len.""" + shape_dict = {"data": (1, 3, 64, 64), "weight": (3, 3, 5, 5)} + input_dict = { + "data": np.random.randint(low=0, high=10, size=shape_dict["data"], dtype="uint8"), + "weight": np.random.randint(low=0, high=10, size=shape_dict["weight"], dtype="int8"), + } + + tflite_compiled_model = relay_compile_model( + relay_text_conv2d, shape_dict=shape_dict, use_vm=use_vm + ) + result = tvmc.run( + tflite_compiled_model, + inputs=input_dict, + hostname=None, + device="cpu", + benchmark=benchmark, + repeat=repeat, + number=number, + ) + + # When no benchmarking is used, an empty list is used to + # represent an absence of results. + if isinstance(result.times, list): + assert len(result.times) == expected_len + else: + assert len(result.times.results) == expected_len From 85bf80c822ec930939eabba1dd8a774c88d88bdd Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 7 Sep 2022 04:00:24 -0300 Subject: [PATCH 112/704] [Docs] Add Commit Message Guideline (#12689) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds the Commit Message Guideline text to Apache TVM documentation in ./docs/contribute/pull_request.rst, under section 'Submit a Pull Request', below subsection 'Guidelines', as a subsection named “Commit Message Guideline”. 
The text in the second-last item in subsection 'Guidelines' that mentions PR tags is also updated to refer to this guideline. This documentation will help guide contributors on how to write good commit messages when submitting code / creating Pull Requests, in accordance with RFC-0088: https://github.com/apache/tvm-rfcs/blob/main/rfcs/0088-commit-message-guideline.md --- docs/contribute/pull_request.rst | 113 ++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 1 deletion(-) diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 81852a212610..7b5509be0aa9 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -62,7 +62,12 @@ Guidelines - Add test-cases to cover the new features or bugfix the patch introduces. - Document the code you wrote, see more at :ref:`doc_guide` - `Create a pull request `_ and fix the problems reported by CI checks. -- Request code reviews from other contributors and improve your patch according to their reviews by ``@``-ing them in your pull request. Tags in PR titles will automatically tag subscribed users, so make sure to put relevant topics in your PR titles (e.g. ``[microTVM] a cool change`` and not ``a cool change for microTVM``). +- Request code reviews from other contributors and improve your patch according + to their reviews by ``@``-ing them in your pull request. Tags in PR titles + will automatically tag subscribed users, so make sure to put relevant topics + in your PR titles (e.g. ``[microTVM] Add a cool change`` and not ``a cool change for microTVM``). + Please see the Commit Message Guideline below on the guidelines about the tags + in a PR/commit title and how to write good PR/commit messages. - To get your code reviewed quickly, we encourage you to help review others' code so they can do the favor in return. - Code review is a shepherding process that helps to improve contributor's code quality. 
@@ -72,6 +77,112 @@ Guidelines - The PR can be merged after the reviewers approve the pull request. +Commit Message Guideline +------------------------ + +Apache TVM uses the Github (GH) platform for patch submission and code review +via Pull Requests (PRs). The final commit (title and body) that is merged into +the Apache TVM main tree is composed of the PR's title and body and must be kept +updated and reflecting the new changes in the code as per the reviews and +discussions. + +Although these guidelines apply essentially to the PRs’ title and body messages, +because GH auto-generates the PR’s title and body from the commits on a given +branch, it’s recommended to follow these guidelines right from the beginning, +when preparing commits in general to be submitted to the Apache TVM project. +This will ease the creation of a new PR, avoiding rework, and also will help the +review. + +The rules below will help to achieve uniformity that has several benefits, both +for review and for the code base maintenance as a whole, helping you to write +commit messages with a good quality suitable for the Apache TVM project, +allowing fast log searches, bisecting, and so on. + +*PR/commit title*: + + - Guarantee a title exists (enforced); + - Don’t use Github usernames in the title, like @username (enforced); + - A tag must be present as a hint about what component(s) of the code + the PRs / commits “touch” (enforced). For example [BugFix], [CI], [microTVM], + and [TVMC]. Tags go between square brackets and appear first in the title. If + more than one tag exists, multiple brackets should be used, like [BugFix][CI]. + The case recommended for tags, in general, is the upper camel case. For example, + prefer the forms [Fix], [BugFix], and [Docker] instead of [fix], [bug_fix], + and [docker]. Acronyms should be kept as such so, for example, use [CI] and + [TVMC] instead of [ci] and [tvmc]. 
Tags help reviewers to identify the PRs + they can/want to review and also help the release folks when generating the + release notes; + - Use an imperative mood. Avoid titles like “Added operator X” and “Updated + image Y in the CI”; instead, use the forms “Add feature X” and “Update image Y + in the CI”; + - Observe proper use of caps at the beginning (uppercase for the first letter) + and for acronyms, like, for instance, TVM, FVP, OpenCL. Hence instead of + “fix tvm use of opencl library”, write it as “Fix TVM use of OpenCL library”; + - Do not put a period at the end of the title. + +*PR/commit body*: + + - Guarantee a body exists (enforced); + - Don’t use Github usernames in body text, like @username (enforced); + - Avoid “bullet” commit message bodies: “bullet” commit message bodies are not + bad per se, but “bullet” commit messages without any description or + explanation are likely as bad as commits without any description, rationale, + or explanation in the body. + +For minor deviations from these guidelines, the community will normally favor +reminding the contributor of this policy over reverting or blocking a commit / +PR. + +Commits and PRs without a title and/or a body are not considered minor +deviations from these guidelines and hence must be avoided. + +Most importantly, the contents of the commit message, especially the body, +should be written to convey the intention of the change, so it should avoid +being vague. For example, commits with a title like “Fix”, “Cleanup”, and +“Fix flaky test” and without any body text should be avoided. Also, for the +review, it will leave the reviewer wondering about what exactly was fixed or +changed and why the change is necessary, slowing the review. 
+ +Below is an example that can be used as a model: + +:: + + [microTVM] Zephyr: Remove zephyr_board option from build, flash, and open_transport methods + + Currently it’s necessary to pass the board type via ‘zephyr_board’ option to + the Project API build, flash, and open_transport methods. + + However, since the board type is already configured when the project is + created (i.e. when the generate_project method is called), it’s possible to + avoid this redundancy by obtaining the board type from the project + configuration files. + + This commit adds code to obtain the board type from the project CMake files, + removing this option from build, flash, and open_transport methods, so it’s + only necessary to specify the ‘zephyr_board’ option when calling + generate_project. + + This commit also moves the ‘verbose’ and ‘west_cmd’ options from ‘build’ + method to ‘generate_project’, reducing further the number of required options + when building a project, since the ‘build’ method is usually called more often + than the ‘generate_project’. + +After a new PR is created and the review starts it’s common that reviewers will +request changes. Usually the author will address the reviewers’ comments and +push additional commits on top of the initial ones. For these additional commits +there is no recommendation regarding the commit messages. However if the +additional commits render the PR title and/or body outdated then it's the +author's responsibility to keep the PR title and body in sync with new changes +in the code and update the PR title and body accordingly (remember that the PR +title and body will be used to compose the final commit message that will land +in the main tree). + +Committers will seek to fix any issues with the commit message prior to +committing but they retain the right to inform the author of the rules and +encourage them to follow them in future. 
Also, they retain the right to ask +the author to update the PR title and/or body when they are not correctly +updated or fixed. + CI Environment -------------- We use Docker images to create stable CI environments that can be deployed to multiple machines. From 6cd31e7bf1d9fed7e2e9f5de1b725d1fdc5a4659 Mon Sep 17 00:00:00 2001 From: "yin.changsheng" Date: Wed, 7 Sep 2022 15:03:47 +0800 Subject: [PATCH 113/704] [TIR] Fix pragma_loop_partition_hint attrs should check it's value (#12699) Current LoopPartition doesn't check the value of attribute key "pragma_loop_partition_hint". Whatever I set pragma_loop_partition_hint to True or False, the result is same, which is confused for debug. This PR fix pragma_loop_partition_hint attribute key should check it's value. --- src/tir/transforms/loop_partition.cc | 17 ++++++++++------- .../test_tir_transform_loop_partition.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index 6ecc6459b904..d410f8cfa471 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -139,14 +139,16 @@ class CandidateSelector final : public StmtExprVisitor { return; } } else if (op->attr_key == attr::pragma_loop_partition_hint) { - const VarNode* var = nullptr; - if (op->node->IsInstance()) { - var = op->node.as(); - } else if (op->node->IsInstance()) { - var = op->node.as()->var.get(); + if (analyzer_.CanProve(op->value)) { + const VarNode* var = nullptr; + if (op->node->IsInstance()) { + var = op->node.as(); + } else if (op->node->IsInstance()) { + var = op->node.as()->var.get(); + } + ICHECK(var); + partition_hint_vars.insert(var); } - ICHECK(var); - partition_hint_vars.insert(var); } StmtExprVisitor::VisitStmt_(op); } @@ -191,6 +193,7 @@ class CandidateSelector final : public StmtExprVisitor { bool no_split_{false}; bool partition_const_loop_{false}; std::unordered_map record_; + arith::Analyzer analyzer_; }; 
// Finder try best to find partitions for hinted vars diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py index 86f2b6696b3d..b6e8d92f8d39 100644 --- a/tests/python/unittest/test_tir_transform_loop_partition.py +++ b/tests/python/unittest/test_tir_transform_loop_partition.py @@ -559,7 +559,7 @@ def test_explicit_partition_hint(): C = te.compute((32,), lambda i: te.if_then_else(i < 16, A[i], B[i]), name="C") s = te.create_schedule(C.op) s.normalize() - s[C].pragma(s[C].op.axis[0], "loop_partition_hint") + s[C].pragma(s[C].op.axis[0], "loop_partition_hint", True) mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None) with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): mod = tvm.tir.transform.StorageFlatten(64)(mod) From 291dd2f06331342f5c89216d5d211cb61fe3d19f Mon Sep 17 00:00:00 2001 From: cery999 <112694109+cery999@users.noreply.github.com> Date: Wed, 7 Sep 2022 15:06:31 +0800 Subject: [PATCH 114/704] support false-positive fast math (#12702) --- include/tvm/topi/elemwise.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/topi/elemwise.h b/include/tvm/topi/elemwise.h index fc9ab139887e..f26105cb180b 100644 --- a/include/tvm/topi/elemwise.h +++ b/include/tvm/topi/elemwise.h @@ -81,7 +81,7 @@ TOPI_DECLARE_UNARY_OP(isinf); inline Tensor fast_tanh_float(const Tensor& in, std::string name, std::string tag) { // Clamp the inputs to the range [-9, 9] since anything outside // this range is +/-1.0f in single-precision. - auto x = maximum(minimum(in, make_const(in->dtype, 9.0)), make_const(in->dtype, -9.0)); + auto x = maximum(make_const(in->dtype, -9.0), minimum(make_const(in->dtype, 9.0), in)); // The monomial coefficients of the numerator polynomial (odd). 
auto alpha_1 = make_const(in->dtype, 4.89352455891786e-03); From b55ffcd18b049ae7a76e02d561535530f384c5d8 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Wed, 7 Sep 2022 10:05:52 +0100 Subject: [PATCH 115/704] [ETHOSN] Add support for transpose convolution (#12674) Adds support for offloading transpose convolution with an optional bias to the NPU. Co-authored-by: Samuel Panijel Co-authored-by: Leo Blonk --- python/tvm/relay/op/contrib/ethosn.py | 18 ++ src/relay/backend/contrib/ethosn/codegen.cc | 39 +++ .../backend/contrib/ethosn/codegen_ethosn.h | 1 + .../contrib/ethosn/convert_equivalent.cc | 15 +- .../backend/contrib/ethosn/ethosn_api.cc | 126 ++++++++++ src/relay/backend/contrib/ethosn/ethosn_api.h | 23 ++ .../contrib/test_ethosn/infrastructure.py | 43 ++++ .../python/contrib/test_ethosn/test_conv2d.py | 21 +- .../test_ethosn/test_conv2d_transpose.py | 234 ++++++++++++++++++ 9 files changed, 487 insertions(+), 33 deletions(-) create mode 100644 tests/python/contrib/test_ethosn/test_conv2d_transpose.py diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index a4e9d9647c95..5129ed9ffaef 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -233,6 +233,16 @@ def qnn_add_pattern(): return input_is_left | input_is_right | two_inputs + def qnn_conv2d_transpose_pattern(): + pattern = is_op("qnn.conv2d_transpose")( + wildcard(), is_constant(), is_constant(), is_constant(), is_constant(), is_constant() + ).has_attr({"data_layout": "NHWC"}) + pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant())) + pattern = is_op("qnn.requantize")( + pattern, is_constant(), is_constant(), is_constant(), is_constant() + ) + return pattern + def check_conv2d(extract): """Check if a conv2d is supported by Ethos-N.""" if not ethosn_available(): @@ -261,6 +271,13 @@ def check_mean(extract): return _ethosn.mean(extract) + def check_conv2d_transpose(extract): + """Check if 
conv2d_transpose is supported by Ethos-N.""" + if not ethosn_available(): + return False + + return _ethosn.conv2d_transpose(extract) + def check_sigmoid(extract): """Check if a sigmoid is supported by Ethos-N.""" if not ethosn_available(): @@ -326,6 +343,7 @@ def check_add(extract): ("ethos-n.qnn_mul", qnn_mul_pattern(), check_mul), ("ethos-n.qnn_add", qnn_add_pattern(), check_add), ("ethos-n.qnn_conv2d", qnn_conv_pattern(), check_conv2d), + ("ethos-n.qnn_conv2d_transpose", qnn_conv2d_transpose_pattern(), check_conv2d_transpose), ("ethos-n.qnn_avg_pool2d", qnn_avg_pool2d_pattern(), check_avg_pool2d), ("ethos-n.qnn_sigmoid", qnn_sigmoid_pattern(), check_sigmoid), ("ethos-n.qnn_fc", qnn_fc_pattern(), check_fc), diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index 69672a143585..c7109b754d2b 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -125,6 +125,10 @@ void InferTensorsVisitor::InferCall(const CallNode* cn) { LeakyReLUParams params; err += EthosnAPI::LeakyReLU(cn->op.as()->body, ¶ms); tensor_table_[cn->args[0]] = {params.input_info}; + } else if (IsEthosnFunc(call, "ethos-n.qnn_conv2d_transpose")) { + QnnConv2dTransposeParams params; + err += EthosnAPI::QnnConv2dTranspose(cn->op.as()->body, ¶ms); + tensor_table_[cn->args[0]] = {params.input_info}; } else if (IsEthosnOp(call, "qnn.concatenate")) { ConcatenateParams params; err = EthosnAPI::Concatenate(call, ¶ms); @@ -311,6 +315,9 @@ sl::TensorsAndId ConstructNetworkVisitor::HandleCall(const CallNode* cn) { } else if (IsEthosnFunc(call, "ethos-n.qnn_leaky_relu")) { if ((err = MakeLeakyReLULayer(call, &tensor))) ReportFatalError(call, err); return MakeOps(tensor); + } else if (IsEthosnFunc(call, "ethos-n.qnn_conv2d_transpose")) { + if ((err = MakeConv2DTransposeLayer(call, &tensor))) ReportFatalError(call, err); + return MakeOps(tensor); } else if (IsEthosnOp(call, "qnn.concatenate")) { if ((err = 
MakeConcatenateLayer(call, &tensor))) ReportFatalError(call, err); return MakeOps(tensor); @@ -537,6 +544,24 @@ EthosnError ConstructNetworkVisitor::MakeLeakyReLULayer(const Call& call, return EthosnError(); } +EthosnError ConstructNetworkVisitor::MakeConv2DTransposeLayer(const Call& call, + sl::TensorAndId* out) { + QnnConv2dTransposeParams params; + if (auto err = EthosnAPI::QnnConv2dTranspose(call->op.as()->body, ¶ms)) { + return err; + } + + auto activation = operand_table_[call->args[0]][0]; + auto weights = AddConstant(network_, params.weights_info, params.raw_weights->data).tensor; + auto bias = AddConstant(network_, params.bias_info, params.raw_bias->data).tensor; + try { + *out = AddTransposeConvolution(network_, *activation, *bias, *weights, params.conv_info); + } catch (const sl::NotSupportedException& e) { + return EthosnError(e.what()); + } + return EthosnError(); +} + EthosnError ConstructNetworkVisitor::MakeConcatenateLayer(const Call& call, sl::TensorAndId* out) { ConcatenateParams params; @@ -913,6 +938,20 @@ TVM_REGISTER_GLOBAL("relay.ethos-n.support.leaky_relu") err += EthosnError(reason); }); +TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d_transpose") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + QnnConv2dTransposeParams params; + auto err = EthosnAPI::QnnConv2dTranspose(call, ¶ms); + err += EthosnCompiler::SupportedSetup(); + char reason[kReasonMaxLength]; + reason[0] = '\0'; + *rv = !err && EthosnCompiler::GetSupported()->IsTransposeConvolutionSupported( + params.bias_info, params.weights_info, params.conv_info, params.input_info, + ¶ms.output_info, reason, sizeof(reason)); + err += EthosnError(reason); + }); + TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate") .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { Call call = args[0]; diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index 863a032cafba..a653b0b8dc97 100644 --- 
a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -206,6 +206,7 @@ class ConstructNetworkVisitor : public MixedModeVisitor, private ErrorReportingP EthosnError MakeSigmoidLayer(const Call& call, sl::TensorAndId* out); EthosnError MakeMeanLayer(const Call& call, sl::TensorAndId* out); EthosnError MakeTanhLayer(const Call& call, sl::TensorAndId* out); + EthosnError MakeConv2DTransposeLayer(const Call& call, sl::TensorAndId* out); EthosnError MakeConcatenateLayer(const Call& call, sl::TensorAndId* out); EthosnError MakeSplitLayer(const Call& call, sl::TensorsAndId* outs); EthosnError MakeDepthToSpaceLayer(const Call& call, sl::TensorAndId* out); diff --git a/src/relay/backend/contrib/ethosn/convert_equivalent.cc b/src/relay/backend/contrib/ethosn/convert_equivalent.cc index 12b5a12afb35..91c924b1b04f 100644 --- a/src/relay/backend/contrib/ethosn/convert_equivalent.cc +++ b/src/relay/backend/contrib/ethosn/convert_equivalent.cc @@ -32,26 +32,13 @@ #include "../../../qnn/utils.h" #include "../../../transforms/pattern_utils.h" #include "../../../transforms/simplify_expr.h" +#include "ethosn_api.h" namespace tvm { namespace relay { namespace contrib { namespace ethosn { -/*! - * \brief Apply constant folding on an expression. - * - * \param expr The expression to fold. - * \param fold_qnn Whether to fold constants for QNN operations. - * \returns The new folded expression. - */ -Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true) { - auto mod = IRModule::FromExpr(expr); - mod = transform::FoldConstant(fold_qnn)(mod); - auto entry_func = Downcast(mod->Lookup("main")); - return expr.as() == nullptr ? entry_func->body : entry_func; -} - /*! * \brief Converts qnn.mul to mathematically equivalent * qnn.conv2d depthwise operation. 
diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.cc b/src/relay/backend/contrib/ethosn/ethosn_api.cc index 4f01c924cf6e..ce57cc23419a 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.cc +++ b/src/relay/backend/contrib/ethosn/ethosn_api.cc @@ -23,6 +23,7 @@ #include "ethosn_api.h" +#include #include #include #include @@ -37,6 +38,9 @@ #include #include +#include "../../../op/make_op.h" +#include "../../../transforms/pattern_utils.h" +#include "../../../transforms/simplify_expr.h" #include "ethosn_support_library/Support.hpp" #include "ethosn_support_library/SupportQueries.hpp" #include "tvm/relay/qnn/attrs.h" @@ -445,6 +449,121 @@ EthosnError EthosnAPI::Mean(const Expr& expr, MeanParams* params) { return err; } +Constant TransposeWeights(const Constant& data, const std::string& input_layout) { + int pos_h = input_layout.find("H"); + int pos_w = input_layout.find("W"); + int pos_i = input_layout.find("I"); + int pos_o = input_layout.find("O"); + + // Currently the expected target layout is HWIO only. 
+ Array target_shape = {pos_h, pos_w, pos_i, pos_o}; + + Expr transpose = MakeTranspose(data, target_shape); + transpose = InferType(FoldConstantExpr(transpose)); + Constant transposed_data = Downcast(transpose); + return transposed_data; +} + +EthosnError EthosnAPI::QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposeParams* params) { + Call requantize = Downcast(expr); + Call bias; + Call conv2d_transpose; + if (requantize->args[0]->IsInstance() && + Downcast(requantize->args[0])->op == Op::Get("nn.bias_add")) { + bias = Downcast(requantize->args[0]); + conv2d_transpose = Downcast(bias->args[0]); + } else { + conv2d_transpose = Downcast(requantize->args[0]); + } + const auto& conv_attr = conv2d_transpose->attrs.as(); + ICHECK(conv_attr) << "Expected type Conv2DTransposeAttrs but was " + << conv2d_transpose->attrs->GetTypeKey(); + + int input_zero_point; + int kernel_zero_point; + int output_zero_point; + std::valarray input_scale; + std::valarray kernel_scale; + float output_scale; + unsigned int qaxis = conv_attr->kernel_layout.find("O"); + + EthosnError err = AsConstant(conv2d_transpose->args[2], &input_zero_point); + err += AsConstant(conv2d_transpose->args[3], &kernel_zero_point); + err += AsConstant(requantize->args[4], &output_zero_point); + err += AsConstant(conv2d_transpose->args[4], &input_scale); + err += AsConstant(conv2d_transpose->args[5], &kernel_scale); + err += AsConstant(requantize->args[3], &output_scale); + + // Convert quantization params + sl::QuantizationInfo input_q_info; + sl::QuantizationInfo weights_q_info; + sl::QuantizationInfo bias_q_info; + sl::QuantizationInfo output_q_info; + err += Tvm2Npu(input_zero_point, input_scale, qaxis, &input_q_info); + err += Tvm2Npu(kernel_zero_point, kernel_scale, qaxis, &weights_q_info); + std::valarray bias_scales = input_q_info.GetScales() * weights_q_info.GetScales(); + err += Tvm2Npu(0, bias_scales, 3, &bias_q_info); + err += Tvm2Npu(output_zero_point, output_scale, &output_q_info); + + // 
Convert convolution attributes + sl::Padding padding; + err += Tvm2Npu(conv_attr->padding, &padding); + sl::Stride stride; + err += Tvm2Npu(conv_attr->strides, &stride); + // Dilation is not supported + std::array dilation = {1, 1}; + AsArray(conv_attr->dilation, &dilation); + if (conv_attr->dilation.size() != 2 || dilation[0] != 1 || dilation[1] != 1) { + err += + EthosnError(ErrStrm() << "dilation=" << conv_attr->dilation << ", dilation must = [1, 1]"); + } + + // Create convolution info + params->conv_info = sl::ConvolutionInfo(padding, stride, output_q_info); + + // Create input info + sl::TensorInfo input_tensor_info; + err += Tvm2Npu(conv2d_transpose->args[0]->checked_type(), &input_tensor_info); + input_tensor_info.m_QuantizationInfo = input_q_info; + params->input_info = input_tensor_info; + + // Create weights info + Constant weights_data = Downcast(conv2d_transpose->args[1]); + if (conv_attr->kernel_layout != "HWIO") { + weights_data = TransposeWeights(weights_data, conv_attr->kernel_layout); + } + const auto* weights_ttype = weights_data->checked_type().as(); + sl::TensorShape weights_tensor_shape; + sl::DataType weights_data_type; + sl::DataFormat weights_data_format; + // Ignore the error here because weights don't have a batch axis + Tvm2Npu(weights_ttype->shape, &weights_tensor_shape); + err += Tvm2Npu(weights_ttype->dtype, &weights_data_type); + err += Tvm2Npu("HWIO", &weights_data_format); + params->weights_info = + sl::TensorInfo(weights_tensor_shape, weights_data_type, weights_data_format, weights_q_info); + + params->raw_weights = weights_data->data; + + // Create bias info + unsigned int out_channels = Downcast(conv_attr->channels)->value; + params->bias_info = sl::TensorInfo({1, 1, 1, out_channels}, sl::DataType::INT32_QUANTIZED, + sl::DataFormat::NHWC, bias_q_info); + if (bias.defined()) { + params->raw_bias = Downcast(bias->args[1])->data; + } else { + params->raw_bias = MakeConstantZeros(tvm::DataType::Int(32), {1, 1, 1, 
out_channels})->data; + } + + // Create output info + sl::TensorInfo output_tensor_info; + err += Tvm2Npu(requantize->checked_type(), &output_tensor_info); + output_tensor_info.m_QuantizationInfo = output_q_info; + params->output_info = output_tensor_info; + + return err; +} + EthosnError EthosnAPI::Tanh(const Expr& expr, TanhParams* params) { Call quantize = Downcast(expr); Call tanh = Downcast(quantize->args[0]); @@ -925,6 +1044,13 @@ EthosnError EthosnAPI::AsConstant(const Expr& expr, T* out) { return EthosnError(); } +Expr FoldConstantExpr(const Expr& expr, bool fold_qnn) { + auto mod = IRModule::FromExpr(expr); + mod = transform::FoldConstant(fold_qnn)(mod); + auto entry_func = Downcast(mod->Lookup("main")); + return expr.as() == nullptr ? entry_func->body : entry_func; +} + } // namespace ethosn } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/ethosn/ethosn_api.h b/src/relay/backend/contrib/ethosn/ethosn_api.h index afe4736bfc40..167106c3d06d 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api.h @@ -24,6 +24,7 @@ #ifndef TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_H_ #define TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_H_ +#include #include #include #include @@ -115,6 +116,16 @@ struct LeakyReLUParams { sl::TensorInfo output_info; }; +struct QnnConv2dTransposeParams { + sl::ConvolutionInfo conv_info; + sl::TensorInfo input_info; + sl::TensorInfo weights_info; + sl::TensorInfo bias_info; + sl::TensorInfo output_info; + runtime::NDArray raw_weights; + runtime::NDArray raw_bias; +}; + struct ConcatenateParams { sl::QuantizationInfo qInfo; sl::ConcatenationInfo concat_info = sl::ConcatenationInfo(1, qInfo); @@ -237,6 +248,9 @@ class EthosnAPI { static EthosnError Tanh(const Expr& expr, TanhParams* params); /*! \brief Extract the Support Library leaky relu params from an ethos-n leaky relu Relu call. 
*/ static EthosnError LeakyReLU(const Expr& expr, LeakyReLUParams* params); + /*! \brief Extract the Support Library transpose params from a Relay + * ethos-n.qnn_conv2d_transpose func */ + static EthosnError QnnConv2dTranspose(const Expr& expr, QnnConv2dTransposeParams* params); /*! \brief Extract the Support Library concatenate params from a Relay qnn.concatenate call */ static EthosnError Concatenate(const Expr& expr, ConcatenateParams* params); /*! \brief Extract the Support Library split params from a Relay split call */ @@ -294,6 +308,15 @@ class EthosnAPI { static EthosnError AsConstant(const Expr& expr, std::valarray* out); }; +/*! + * \brief Apply constant folding on an expression. + * + * \param expr The expression to fold. + * \param fold_qnn Whether to fold constants for QNN operations. + * \returns The new folded expression. + */ +Expr FoldConstantExpr(const Expr& expr, bool fold_qnn = true); + } // namespace ethosn } // namespace contrib } // namespace relay diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index c658b33747c3..6b019686968e 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -21,6 +21,9 @@ from hashlib import md5 from itertools import zip_longest, combinations import os +from typing import Tuple +import math + import numpy as np from PIL import Image @@ -28,6 +31,7 @@ from tvm import relay from tvm.contrib import utils, graph_executor, download from tvm.relay.op.contrib import partition_for_ethosn + from . import _infrastructure @@ -340,5 +344,44 @@ def get_conv2d_qnn_params( return output_zp, output_sc +def get_same_padding( + data: Tuple[int, int], + kernel: Tuple[int, int], + dilation: Tuple[int, int], + stride: Tuple[int, int], +) -> Tuple[int, int, int, int]: + """ + Get the padding values required for 'SAME' padding. 
+ + Parameters + ---------- + data : Tuple[int, int] + The height and width of the data respectively. + kernel : Tuple[int, int] + The height and width of the kernel respectively. + dilation : Tuple[int, int] + The dilation of the kernel. + stride : Tuple[int, int] + The stride of the kernel. + + Returns + ------- + Tuple[int, int, int, int] + The padding values for top, left, bottom and right respectively. + """ + dilated_kernel_h = dilation[0] * (kernel[0] - 1) + 1 + dilated_kernel_w = dilation[1] * (kernel[1] - 1) + 1 + out = int(math.ceil(float(data[0]) / float(stride[0]))) + pad = max(0, (out - 1) * stride[0] + dilated_kernel_h - data[0]) + pad_top = pad // 2 + pad_bottom = pad - pad_top + + out = int(math.ceil(float(data[1]) / float(stride[1]))) + pad = max(0, (out - 1) * stride[1] + dilated_kernel_w - data[1]) + pad_left = pad // 2 + pad_right = pad - pad_left + return (pad_top, pad_left, pad_bottom, pad_right) + + def get_ethosn_variant(): return os.getenv("ETHOSN_VARIANT_CONFIG", default="Ethos-N78_1TOPS_2PLE_RATIO") diff --git a/tests/python/contrib/test_ethosn/test_conv2d.py b/tests/python/contrib/test_ethosn/test_conv2d.py index 4026f8267d72..a6ce73656bfc 100644 --- a/tests/python/contrib/test_ethosn/test_conv2d.py +++ b/tests/python/contrib/test_ethosn/test_conv2d.py @@ -17,8 +17,6 @@ """Arm(R) Ethos(TM)-N integration conv2d tests""" -import math - import numpy as np import pytest @@ -29,21 +27,6 @@ from . 
import infrastructure as tei -def _get_same_padding(data, kernel, dilation, stride): - dilated_kernel_h = dilation[0] * (kernel[0] - 1) + 1 - dilated_kernel_w = dilation[1] * (kernel[1] - 1) + 1 - out = int(math.ceil(float(data[0]) / float(stride[0]))) - pad = max(0, (out - 1) * stride[0] + dilated_kernel_h - data[0]) - pad_top = pad // 2 - pad_bottom = pad - pad_top - - out = int(math.ceil(float(data[1]) / float(stride[1]))) - pad = max(0, (out - 1) * stride[1] + dilated_kernel_w - data[1]) - pad_left = pad // 2 - pad_right = pad - pad_left - return [pad_top, pad_left, pad_bottom, pad_right] - - def _get_model( shape, kernel_h, @@ -65,7 +48,7 @@ def _get_model( """Return a model and any parameters it may have""" a = relay.var("a", shape=shape, dtype=dtype) if pad in ("op", "both"): - p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) + p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) a = relay.nn.pad( a, pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)], @@ -74,7 +57,7 @@ def _get_model( ) shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3]) - p = _get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) + p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides) if weight_format == "HWIO": weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels) else: diff --git a/tests/python/contrib/test_ethosn/test_conv2d_transpose.py b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py new file mode 100644 index 000000000000..84aa7e969b30 --- /dev/null +++ b/tests/python/contrib/test_ethosn/test_conv2d_transpose.py @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Arm(R) Ethos(TM)-N integration conv2d tests""" + +import pytest +import numpy as np + +import tvm +from tvm import relay +from tvm.testing import requires_ethosn +from . import infrastructure as tei + + +def _get_model( + shape, + kernel_h, + kernel_w, + input_zp, + input_sc, + kernel_zp, + kernel_sc, + output_zp, + output_sc, + stride, + dilation, + groups, + kernel_layout, + dtype, + out_channels, + bias, +): + """Return a model and any parameters it may have""" + a = relay.var("a", shape=shape, dtype=dtype) + p = tei.get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, stride) + weight_shape = (shape[3], out_channels // groups, kernel_h, kernel_w) + + weight_data = tvm.nd.array( + np.random.randint( + np.iinfo(dtype).min, + high=(np.iinfo(dtype).max + 1), + size=weight_shape, + dtype=dtype, + ) + ) + weights = relay.const(weight_data, dtype) + op = relay.qnn.op.conv2d_transpose( + a, + weights, + input_zero_point=relay.const(input_zp, "int32"), + input_scale=relay.const(input_sc, "float32"), + kernel_zero_point=relay.const(kernel_zp, "int32"), + kernel_scale=relay.const(kernel_sc, "float32"), + kernel_size=(kernel_h, kernel_w), + padding=p, + strides=stride, + dilation=dilation, + data_layout="NHWC", + kernel_layout=kernel_layout, + out_dtype="int32", + channels=out_channels, + groups=groups, + ) + if bias: + bias_data = tvm.nd.array( + np.random.randint( + 
np.iinfo(dtype).min, + high=np.iinfo(dtype).max + 1, + size=(out_channels,), + dtype="int32", + ) + ) + biasc = relay.const(bias_data, "int32") + op = relay.nn.bias_add(op, biasc, axis=3) + + if isinstance(kernel_sc, tvm.runtime.ndarray.NDArray): + req_input_sc = [sc * input_sc for sc in kernel_sc.numpy()] + else: + req_input_sc = input_sc * kernel_sc + + op = relay.qnn.op.requantize( + op, + input_zero_point=relay.const(input_zp, "int32"), + input_scale=relay.const(req_input_sc, "float32"), + output_zero_point=relay.const(output_zp, "int32"), + output_scale=relay.const(output_sc, "float32"), + axis=3, + rounding="UPWARD", + out_dtype=dtype, + ) + params = {"w": weight_data} + if bias: + params["b"] = bias_data + return op, params + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "ifm_shape,strides,kernel_size,out_channels,bias", + [ + ((1, 2, 2, 1), (2, 2), (1, 1), 1, False), + ((1, 2, 2, 5), (2, 2), (3, 5), 4, False), + ((1, 7, 7, 4), (2, 2), (7, 9), 8, True), + ], +) +def test_conv2d_transpose(ifm_shape, strides, kernel_size, out_channels, dtype, bias): + """Check transpose convolution output with TVM.""" + np.random.seed(0) + + kernel_layout = "IOHW" + dilation = (1, 1) + groups = 1 + + iinfo = np.iinfo(dtype) + data_min = iinfo.min + data_max = iinfo.max + + input_zp = np.random.randint(data_min, data_max) + input_sc = np.random.random() * 2 + kernel_zp = np.random.randint(data_min, data_max) + kernel_sc = np.random.random() * 4 + output_zp, output_sc = tei.get_conv2d_qnn_params( + dtype, input_zp, input_sc, kernel_zp, kernel_sc, ifm_shape[1], ifm_shape[2], ifm_shape[3] + ) + + model, params = _get_model( + shape=ifm_shape, + kernel_h=kernel_size[0], + kernel_w=kernel_size[1], + input_zp=input_zp, + input_sc=input_sc, + kernel_zp=kernel_zp, + kernel_sc=kernel_sc, + output_zp=output_zp, + output_sc=output_sc, + stride=strides, + dilation=dilation, + groups=groups, + kernel_layout=kernel_layout, + 
dtype=dtype, + out_channels=out_channels, + bias=bias, + ) + + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.randint(data_min, data_max + 1, size=ifm_shape, dtype=dtype)) + } + + for npu in [False, True]: + mod = tei.make_module(model, params) + outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu)) + + tei.verify(outputs, dtype, 1) + + +@requires_ethosn +@pytest.mark.parametrize("dtype", ["uint8", "int8"]) +@pytest.mark.parametrize( + "shape, stride, dilation, groups, err_msg", + [ + ( + (1, 4, 4, 4), + (1, 1, 1), + (1, 1), + 1, + "stride size=3, stride size must = 2", + ), + ( + (1, 4, 4, 4), + (2, 2), + (2, 2), + 2, + "dilation=[2, 2], dilation must = [1, 1]", + ), + ( + (2, 4, 4, 4), + (1, 1), + (1, 1), + 1, + "batch size=2, batch size must = 1", + ), + ], +) +def test_conv2d_transpose_failure( + shape, + stride, + dilation, + groups, + err_msg, + dtype, +): + """ + Test transpose_conv2d error messages. + """ + np.random.seed(0) + out_channels = 8 + + model, _ = _get_model( + shape=shape, + kernel_h=1, + kernel_w=1, + input_zp=0, + input_sc=1, + kernel_zp=0, + kernel_sc=1, + output_zp=0, + output_sc=1, + stride=stride, + dilation=dilation, + groups=groups, + kernel_layout="IOHW", + dtype=dtype, + out_channels=out_channels, + bias=False, + ) + model = tei.make_ethosn_composite(model, "ethos-n.qnn_conv2d_transpose") + mod = tei.make_ethosn_partition(model) + tei.test_error(mod, {}, err_msg) From ff9a5309ecd713214a61e9e848c90289831f70c5 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Wed, 7 Sep 2022 07:26:09 -0700 Subject: [PATCH 116/704] [microTVM][Zephyr] Enable -O2 optimization on build by default (#12718) * add speed optimization flag * trigger * add exception for qemu_riscv64 --- .../zephyr/template_project/microtvm_api_server.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index
76895c430bd6..b73779f68148 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -456,6 +456,7 @@ def server_info_query(self, tvm_version): } def _create_prj_conf(self, project_dir, options): + zephyr_board = options["zephyr_board"] with open(project_dir / "prj.conf", "w") as f: f.write( "# For UART used from main().\n" @@ -477,7 +478,7 @@ def _create_prj_conf(self, project_dir, options): f.write("# For math routines\n" "CONFIG_NEWLIB_LIBC=y\n" "\n") - if self._has_fpu(options["zephyr_board"]): + if self._has_fpu(zephyr_board): f.write("# For models with floating point.\n" "CONFIG_FPU=y\n" "\n") # Set main stack size, if needed. @@ -488,9 +489,13 @@ def _create_prj_conf(self, project_dir, options): f.write("\n# Extra prj.conf directives\n") for line, board_list in self.EXTRA_PRJ_CONF_DIRECTIVES.items(): - if options["zephyr_board"] in board_list: + if zephyr_board in board_list: f.write(f"{line}\n") + # TODO(mehrdadh): due to https://github.com/apache/tvm/issues/12721 + if zephyr_board not in ["qemu_riscv64"]: + f.write("# For setting -O2 in compiler.\n" "CONFIG_SPEED_OPTIMIZATIONS=y\n") + f.write("\n") API_SERVER_CRT_LIBS_TOKEN = "" From 269d536be0308f6594b22615d33cc0f0539ad39a Mon Sep 17 00:00:00 2001 From: Aakanksha Verma <89928182+avquicinc@users.noreply.github.com> Date: Wed, 7 Sep 2022 19:59:54 +0530 Subject: [PATCH 117/704] [HEXAGON] [TOPI] Dequantize (#12677) dequantize op hexagon --- python/tvm/topi/hexagon/qnn/__init__.py | 5 + python/tvm/topi/hexagon/qnn/dequantize.py | 94 ++++++++++++++ python/tvm/topi/hexagon/utils.py | 7 + .../contrib/test_hexagon/infrastructure.py | 2 + .../topi/test_dequantize_slice.py | 121 ++++++++++++++++++ 5 files changed, 229 insertions(+) create mode 100644 python/tvm/topi/hexagon/qnn/dequantize.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py diff --git a/python/tvm/topi/hexagon/qnn/__init__.py 
b/python/tvm/topi/hexagon/qnn/__init__.py index e27e3793d565..25d1e6d1854d 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -18,3 +18,8 @@ """ Computes and schedules for Hexagon quantized ops """ from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule + +from .dequantize import ( + dequantize_compute, + dequantize_schedule, +) diff --git a/python/tvm/topi/hexagon/qnn/dequantize.py b/python/tvm/topi/hexagon/qnn/dequantize.py new file mode 100644 index 000000000000..3e1466e88b38 --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/dequantize.py @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name + +""" Hexagon qnn.dequantize slice op compute and schedule""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn + + +def dequantize_compute(tensor_A, scale_A, zero_point_A): + + return te.compute( + tensor_A.shape, + lambda *indices: (scale_A * (tensor_A[indices] - zero_point_A)).astype("float32"), + name="dequantize", + ) + + +def dequantize_stir_schedule_nhwc_8h8w32c( + _in, + _out, + in_layout, + out_layout, +): + """Schedule for nhwc int8/uint8 to f32 : nhwc layout""" + func = te.create_prim_func([_in, _out]) + sch = tir.Schedule(func, debug_mask="all") + block_name = "dequantize" + n, h, w, c = sch.get_loops(sch.get_block(block_name)) + ho, hi = sch.split(h, [None, 4]) + wo, wi = sch.split(w, [None, 8]) + wio, wii = sch.split(wi, [None, 4]) + co, ci = sch.split(c, [None, 32]) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + sch.reorder(n, ho, wo, co, hi, wio, wii, ci) + wii_ci = sch.fuse(wii, ci) + sch.vectorize(wii_ci) + return sch + + +def dequantize_stir_schedule_nc( + _in, + _out, + in_layout, + out_layout, +): + """Schedule for nc int8/uint8 to f32 : nc layout""" + func = te.create_prim_func([_in, _out]) + sch = tir.Schedule(func, debug_mask="all") + block_name = "dequantize" + _, c_orig = sch.get_loops(sch.get_block(block_name)) + _, c_inner = sch.split(c_orig, [None, 512]) + sch.transform_layout(block_name, "A", in_layout) + sch.transform_layout(block_name, block_name, out_layout) + sch.vectorize(c_inner) + return sch + + +def dequantize_schedule(_in, _output, in_layout_str, out_layout_str): + """Schedule for int8/uint8 to f32 : top level function""" + f32_layout_transform_func = get_layout_transform_fn(out_layout_str) + in_layout_transform_func = get_layout_transform_fn(in_layout_str) + if out_layout_str == "nhwc-4h2w32c2w-2d": + return dequantize_stir_schedule_nhwc_8h8w32c( + _in, + _output, + in_layout_transform_func, 
+ f32_layout_transform_func, + ) + if out_layout_str == "nc-512c-2d": + return dequantize_stir_schedule_nc( + _in, + _output, + in_layout_transform_func, + f32_layout_transform_func, + ) + raise RuntimeError(f"Unexpected layout '{layout}'") diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index c056408947b7..9939e5b6fbb7 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -100,6 +100,11 @@ def nc_2048_2d(n, c): return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048] +def nc_2048c_2d(n, c): + """Return index map for nc_2048 2d layout""" + return [n, c // 2048, te.AXIS_SEPARATOR, c % 2048] + + def nhwc_8h8w32c_2d(n, h, w, c): """Return index map for nhwc_8h8w32c 2d layout""" return [n, h // 8, w // 8, c // 32, te.AXIS_SEPARATOR, h % 8, w % 8, c % 32] @@ -156,6 +161,8 @@ def get_layout_transform_fn(layout): return nhwc_2048c_2d if layout == "nc-2048-2d": return nc_2048_2d + if layout == "nc-2048c-2d": + return nc_2048c_2d if layout == "nhwc-8h8w32c-2d": return nhwc_8h8w32c_2d if layout == "n11c-2048c-2d": diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 70e50fcb68d6..71960b649ea2 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -295,6 +295,8 @@ def transform_numpy(arr_np, current_layout: str, new_layout: str): return arr_np.reshape([n, c // 1024, 1024]) if new_layout in ["nc-512c-2d"]: return arr_np.reshape([n, c // 512, 512]) + if new_layout in ["nc-2048c-2d"]: + return arr_np.reshape([n, c // 2048, 2048]) raise RuntimeError(f"Unexpected new_layout '{new_layout}'") if current_layout == "nhw": diff --git a/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py new file mode 100644 index 000000000000..e9b3dd132692 --- /dev/null +++ 
b/tests/python/contrib/test_hexagon/topi/test_dequantize_slice.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name + +""" Tests for Hexagon dequantize """ +import numpy as np + +import tvm +import tvm.testing +from tvm import te +from tvm.topi.hexagon import qnn +from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np + + +class TestDequantizeSlice2d: + """ + For testing Dequantize Slice ops + """ + + input_shape, orig_layout, input_layout, output_layout, axis_sep, dtype = tvm.testing.parameters( + ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"), + ((1, 16, 64, 128), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"), + ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "int8"), + ((1, 8, 8, 32), "nhwc", "nhwc-8h8w32c-2d", "nhwc-4h2w32c2w-2d", [4], "uint8"), + ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "int8"), + ((1, 2048), "nc", "nc-2048c-2d", "nc-512c-2d", [2], "uint8"), + ) + + working_scope = tvm.testing.parameter("global.vtcm") + + @tvm.testing.fixture + def input_np(self, input_shape): + arr_np = np.random.random(size=input_shape).astype("float32") + return arr_np + + 
@tvm.testing.fixture + def transformed_input_np(self, input_np, orig_layout, input_layout, dtype): + quant_arr, scale, zero_point = quantize_np(input_np, dtype) + return [transform_numpy(quant_arr, orig_layout, input_layout), scale, zero_point] + + @tvm.testing.fixture + def expected_output_np(self, input_np, dtype): + quant_np, scale, zero_point = quantize_np(input_np, dtype) + ref_np = (scale * (quant_np.astype("int32") - zero_point)).astype("float32") + return ref_np + + @tvm.testing.fixture + def transformed_expected_output_np(self, expected_output_np, orig_layout, output_layout): + return transform_numpy(expected_output_np, orig_layout, output_layout) + + @tvm.testing.requires_hexagon + def test_dequant_qnn( + self, + input_shape, + dtype, + input_layout, + output_layout, + transformed_input_np, + transformed_expected_output_np, + axis_sep, + hexagon_session, + working_scope, + ): + """ + Top level testing function for dequantize + """ + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + dequant_input = te.placeholder(input_shape, name="A", dtype=dtype) + + in_data_np, in_scale, in_zero_pt = transformed_input_np + + dequant_output = qnn.dequantize_compute(dequant_input, in_scale, in_zero_pt) + + tir_s = qnn.dequantize_schedule(dequant_input, dequant_output, input_layout, output_layout) + + input_data = allocate_hexagon_array( + hexagon_session.device, + data=in_data_np, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + output_data = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=transformed_expected_output_np.dtype, + axis_separators=axis_sep, + mem_scope=working_scope, + ) + with tvm.transform.PassContext(opt_level=3): + tir_irm = tvm.lower(tir_s.mod, [dequant_input, dequant_output], name="dequantize") + runtime_module = tvm.build(tir_irm, target=target, name="dequantize") + mod = hexagon_session.load_module(runtime_module) 
+ + mod(input_data, output_data) + output_np = output_data.numpy() + tvm.testing.assert_allclose( + output_np, + transformed_expected_output_np, + 1e-3, + 1e-3, + ) + + +if __name__ == "__main__": + tvm.testing.main() From 2622ac9e638b259cae017813ad93937c0ff8a2f9 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 7 Sep 2022 09:12:02 -0700 Subject: [PATCH 118/704] [Build] Update C++ standard to C++17 for AOT, iOS, VTA (#12712) Follow-up from https://github.com/apache/tvm/pull/12337 and https://github.com/apache/tvm/pull/12693, updating a few additional locations that specified C++14. --- apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj | 4 ++-- include/tvm/support/span.h | 2 +- tests/python/relay/aot/test_cpp_aot.py | 2 +- vta/python/vta/exec/rpc_server.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj index 61427d0ca248..ccc61707d3f2 100644 --- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj +++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj @@ -255,7 +255,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++17"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -308,7 +308,7 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++17"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; diff --git a/include/tvm/support/span.h b/include/tvm/support/span.h index 689a48dee788..768252f77ce9 100644 --- a/include/tvm/support/span.h +++ b/include/tvm/support/span.h @@ -36,7 +36,7 @@ namespace support { /*! * \brief A partial implementation of the C++20 std::span. 
* - * At the time of writing, TVM must compile against C++14. + * At the time of writing, TVM must compile against C++17. */ template class Span { diff --git a/tests/python/relay/aot/test_cpp_aot.py b/tests/python/relay/aot/test_cpp_aot.py index 4ffe302763f8..b67bc90d34fd 100644 --- a/tests/python/relay/aot/test_cpp_aot.py +++ b/tests/python/relay/aot/test_cpp_aot.py @@ -138,7 +138,7 @@ def test_mobilenet(enable_usmp, target_kind): temp_dir = tvm.contrib.utils.TempDirectory() test_so_path = temp_dir / "test.so" - mod.export_library(test_so_path, cc="c++", options=["-std=gnu++14", "-g3", "-O0"]) + mod.export_library(test_so_path, cc="c++", options=["-std=gnu++17", "-g3", "-O0"]) loaded_mod = tvm.runtime.load_module(test_so_path) runner = tvm.runtime.executor.AotModule(loaded_mod["default"](tvm.cpu(0))) runner.set_input(**inputs) diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py index dcf564dd0314..1abad98b2216 100644 --- a/vta/python/vta/exec/rpc_server.py +++ b/vta/python/vta/exec/rpc_server.py @@ -106,7 +106,7 @@ def reconfig_runtime(cfg_json): if pkg.same_config(old_cfg): logging.info("Skip reconfig_runtime due to same config.") return - cflags = ["-O2", "-std=c++14"] + cflags = ["-O2", "-std=c++17"] cflags += pkg.cflags ldflags = pkg.ldflags lib_name = dll_path From 010c662938245d607fbffd4bd10a9c7fb93e4270 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Wed, 7 Sep 2022 12:17:59 -0700 Subject: [PATCH 119/704] [TVMScript] IRBuilder methods for `IRModule` (#12694) * IRBuilder methods for `IRModule` This PR introduces IRBuilder methods for `IRModule`. 
Co-authored-by: yongwww * apply code review suggestion Co-authored-by: yongwww --- include/tvm/script/ir_builder/ir/frame.h | 71 +++++++++++++++++++ include/tvm/script/ir_builder/ir/ir.h | 43 +++++++++++ python/tvm/script/ir_builder/ir/__init__.py | 19 +++++ python/tvm/script/ir_builder/ir/_ffi_api.py | 20 ++++++ python/tvm/script/ir_builder/ir/frame.py | 26 +++++++ python/tvm/script/ir_builder/ir/ir.py | 24 +++++++ src/script/ir_builder/ir/frame.cc | 43 +++++++++++ src/script/ir_builder/ir/ir.cc | 38 ++++++++++ .../test_tvmscript_ir_builder_irmodule.py | 41 +++++++++++ 9 files changed, 325 insertions(+) create mode 100644 include/tvm/script/ir_builder/ir/frame.h create mode 100644 include/tvm/script/ir_builder/ir/ir.h create mode 100644 python/tvm/script/ir_builder/ir/__init__.py create mode 100644 python/tvm/script/ir_builder/ir/_ffi_api.py create mode 100644 python/tvm/script/ir_builder/ir/frame.py create mode 100644 python/tvm/script/ir_builder/ir/ir.py create mode 100644 src/script/ir_builder/ir/frame.cc create mode 100644 src/script/ir_builder/ir/ir.cc create mode 100644 tests/python/unittest/test_tvmscript_ir_builder_irmodule.py diff --git a/include/tvm/script/ir_builder/ir/frame.h b/include/tvm/script/ir_builder/ir/frame.h new file mode 100644 index 000000000000..181774bc53bc --- /dev/null +++ b/include/tvm/script/ir_builder/ir/frame.h @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_ +#define TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace script { +namespace ir_builder { + +/*! + * \brief A frame that represents the IRModule frame with functions and global variables. + * + * \sa IRModuleFrame + */ +class IRModuleFrameNode : public IRBuilderFrameNode { + public: + Array global_vars; + Array functions; + + void VisitAttrs(tvm::AttrVisitor* v) { + IRBuilderFrameNode::VisitAttrs(v); + v->Visit("global_vars", &global_vars); + v->Visit("functions", &functions); + } + + static constexpr const char* _type_key = "script.ir_builder.IRModuleFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(IRModuleFrameNode, IRBuilderFrameNode); + + public: + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to IRModuleFrameNode. + * + * \sa IRModuleFrameNode + */ +class IRModuleFrame : public IRBuilderFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IRModuleFrame, IRBuilderFrame, + IRModuleFrameNode); +}; + +} // namespace ir_builder +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_IR_BUILDER_IR_FRAME_H_ diff --git a/include/tvm/script/ir_builder/ir/ir.h b/include/tvm/script/ir_builder/ir/ir.h new file mode 100644 index 000000000000..0bd5473c7eaf --- /dev/null +++ b/include/tvm/script/ir_builder/ir/ir.h @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_IR_BUILDER_IR_IR_H_ +#define TVM_SCRIPT_IR_BUILDER_IR_IR_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace script { +namespace ir_builder { + +/*! + * \brief The IRModule declaration statement. + * \return The IRModuleFrame. + */ +TVM_DLL IRModuleFrame IRModule(); + +} // namespace ir_builder +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_IR_BUILDER_IR_IR_H_ diff --git a/python/tvm/script/ir_builder/ir/__init__.py b/python/tvm/script/ir_builder/ir/__init__.py new file mode 100644 index 000000000000..ebb9728737ad --- /dev/null +++ b/python/tvm/script/ir_builder/ir/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Package tvm.script.ir_builder.ir""" +from .frame import IRModuleFrame +from .ir import ir_module diff --git a/python/tvm/script/ir_builder/ir/_ffi_api.py b/python/tvm/script/ir_builder/ir/_ffi_api.py new file mode 100644 index 000000000000..874cc278af83 --- /dev/null +++ b/python/tvm/script/ir_builder/ir/_ffi_api.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""FFI APIs""" +import tvm._ffi + +tvm._ffi._init_api("script.ir_builder.ir", __name__) # pylint: disable=protected-access diff --git a/python/tvm/script/ir_builder/ir/frame.py b/python/tvm/script/ir_builder/ir/frame.py new file mode 100644 index 000000000000..e16d86dc227e --- /dev/null +++ b/python/tvm/script/ir_builder/ir/frame.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Package tvm.script.ir_builder.ir.frame""" + +from tvm._ffi import register_object as _register_object + +from ..base import IRBuilderFrame + + +@_register_object("script.ir_builder.IRModuleFrame") +class IRModuleFrame(IRBuilderFrame): + ... diff --git a/python/tvm/script/ir_builder/ir/ir.py b/python/tvm/script/ir_builder/ir/ir.py new file mode 100644 index 000000000000..df920364356b --- /dev/null +++ b/python/tvm/script/ir_builder/ir/ir.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Package tvm.script.ir_builder.ir.ir""" + +from . import _ffi_api +from .frame import IRModuleFrame + + +def ir_module() -> IRModuleFrame: + return _ffi_api.IRModule() # pylint: disable=no-member # type: ignore diff --git a/src/script/ir_builder/ir/frame.cc b/src/script/ir_builder/ir/frame.cc new file mode 100644 index 000000000000..c85e30544aca --- /dev/null +++ b/src/script/ir_builder/ir/frame.cc @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include +#include +#include + +namespace tvm { +namespace script { +namespace ir_builder { + +void IRModuleFrameNode::ExitWithScope() { + ICHECK_EQ(functions.size(), global_vars.size()); + int n = functions.size(); + Map func_map; + for (int i = 0; i < n; ++i) { + func_map.Set(global_vars[i], functions[i]); + } + IRBuilder builder = IRBuilder::Current(); + ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set"; + builder->result = tvm::IRModule(func_map); +} + +TVM_REGISTER_NODE_TYPE(IRModuleFrameNode); + +} // namespace ir_builder +} // namespace script +} // namespace tvm diff --git a/src/script/ir_builder/ir/ir.cc b/src/script/ir_builder/ir/ir.cc new file mode 100644 index 000000000000..bcd21de144bb --- /dev/null +++ b/src/script/ir_builder/ir/ir.cc @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#include +#include +#include + +namespace tvm { +namespace script { +namespace ir_builder { + +IRModuleFrame IRModule() { + ObjectPtr n = make_object(); + n->global_vars.clear(); + n->functions.clear(); + return IRModuleFrame(n); +} + +TVM_REGISTER_GLOBAL("script.ir_builder.ir.IRModule").set_body_typed(IRModule); + +} // namespace ir_builder +} // namespace script +} // namespace tvm diff --git a/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py b/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py new file mode 100644 index 000000000000..7adf192df36b --- /dev/null +++ b/tests/python/unittest/test_tvmscript_ir_builder_irmodule.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Unittests for tvm.script.ir_builder.ir""" +import pytest +import tvm.testing +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import ir as I +from tvm import ir +from tvm.ir.base import assert_structural_equal + + +def test_ir_builder_irmodule(): + with IRBuilder() as ib: # pylint: disable=invalid-name + with I.ir_module(): + pass + + # the ir_module generated by IRBuilder + ir_module_actual = ib.get() + + # the expected ir_module + ir_module_expected = ir.IRModule(None, None) + + assert_structural_equal(ir_module_actual, ir_module_expected, map_free_vars=True) + + +if __name__ == "__main__": + tvm.testing.main() From bee562790894ee195bd934740a30dabfbb2f5483 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Wed, 7 Sep 2022 20:53:54 +0100 Subject: [PATCH 120/704] [TFLite][CI] Update TensorFlow dependency to 2.9.1 (#12131) This updates the TF version to be used in TVM CI to 2.9.1, which brings improvements so that more platforms are supported by official packages. When building TFLite, an update to CMake was also required, which is updated now to 3.18.4. ethos-u-vela dependency is also updated, from version 3.2.0 to 3.4.0 so that it is closer to the TensorFlow version being proposed here. This PR updates the Docker images scripting to install TF and TFLite. 
Change-Id: I290085f0c018ad57606f1295494c19ff6e1af2dd --- cmake/modules/contrib/TFLite.cmake | 2 ++ docker/Dockerfile.ci_cortexm | 3 +++ docker/Dockerfile.ci_cpu | 3 +++ docker/Dockerfile.ci_gpu | 3 +++ docker/Dockerfile.ci_riscv | 3 +++ docker/install/ubuntu_install_cmake_source.sh | 4 ++-- .../install/ubuntu_install_python_package.sh | 2 +- docker/install/ubuntu_install_tensorflow.sh | 5 ++-- .../ubuntu_install_tensorflow_aarch64.sh | 23 ++----------------- docker/install/ubuntu_install_tflite.sh | 13 +++++++++-- docker/install/ubuntu_install_vela.sh | 2 +- docker/install/ubuntu_install_zephyr.sh | 3 ++- 12 files changed, 35 insertions(+), 31 deletions(-) diff --git a/cmake/modules/contrib/TFLite.cmake b/cmake/modules/contrib/TFLite.cmake index 31597109095a..b8d6a0daff19 100644 --- a/cmake/modules/contrib/TFLite.cmake +++ b/cmake/modules/contrib/TFLite.cmake @@ -38,8 +38,10 @@ if(NOT USE_TFLITE STREQUAL "OFF") set(USE_TFLITE ${USE_TENSORFLOW_PATH}/tensorflow/lite/tools/make/gen/*/lib) endif() find_library(TFLITE_CONTRIB_LIB libtensorflow-lite.a ${USE_TFLITE}) + file(GLOB_RECURSE TFLITE_DEPS "${USE_TFLITE}/*.a") list(APPEND TVM_RUNTIME_LINKER_LIBS ${TFLITE_CONTRIB_LIB}) + list(APPEND TVM_RUNTIME_LINKER_LIBS ${TFLITE_DEPS}) if (NOT USE_FLATBUFFERS_PATH STREQUAL "none") include_directories(${USE_FLATBUFFERS_PATH}/include) diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index 63089f3d65f2..fb3c10d393f0 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -32,6 +32,9 @@ RUN bash /install/ubuntu_install_googletest.sh COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh RUN bash /install/ubuntu1804_install_python.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh RUN bash /install/ubuntu1804_install_python_venv.sh ENV 
PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 3812bfbd197e..d9f353d41be1 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -40,6 +40,9 @@ RUN bash /install/ubuntu_install_python_package.sh COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh RUN bash /install/ubuntu1804_install_llvm.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh RUN bash /install/ubuntu_install_dnnl.sh diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index f04d8515b8dc..6f02ab97c09e 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -32,6 +32,9 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index b65b87a86386..1ca792e20c98 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -32,6 +32,9 @@ RUN bash /install/ubuntu_install_googletest.sh COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh RUN bash /install/ubuntu1804_install_python.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh RUN bash /install/ubuntu1804_install_python_venv.sh ENV 
PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh index 18335c98c403..030cb4ea0406 100755 --- a/docker/install/ubuntu_install_cmake_source.sh +++ b/docker/install/ubuntu_install_cmake_source.sh @@ -20,8 +20,8 @@ set -e set -u set -o pipefail -v=3.14 -version=3.14.7 +v=3.18 +version=3.18.4 wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz tar xvf cmake-${version}.tar.gz cd cmake-${version} diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 3fc310c47e34..9fee9d01425c 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -28,7 +28,7 @@ pip3 install --upgrade \ cython \ decorator \ mypy \ - numpy~=1.19.5 \ + numpy==1.21.* \ orderedset \ packaging \ Pillow==9.1.0 \ diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh index 17d2b31d9bc2..2225b7aef3b8 100755 --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -21,6 +21,5 @@ set -u set -o pipefail pip3 install \ - "h5py==3.1.0" \ - keras==2.6 \ - tensorflow==2.6.5 + keras==2.9 \ + tensorflow==2.9.1 diff --git a/docker/install/ubuntu_install_tensorflow_aarch64.sh b/docker/install/ubuntu_install_tensorflow_aarch64.sh index 59cc5b4814b3..09efe5db5707 100755 --- a/docker/install/ubuntu_install_tensorflow_aarch64.sh +++ b/docker/install/ubuntu_install_tensorflow_aarch64.sh @@ -21,27 +21,8 @@ set -euxo pipefail # Build dependencies apt-install-and-clear -y --no-install-recommends libhdf5-dev -# Downloading Tensorflow and installing it manually is needed -# just as a temporary workaround while we move to a newer -# version (>2.7) that is hosted in the official PyPI repository. 
-linaro_repo="https://snapshots.linaro.org/ldcg/python/tensorflow-manylinux/43/tensorflow-aarch64" -tensorflow_package="tensorflow_aarch64-2.6.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl" -tmpdir=$(mktemp -d) - -cleanup() -{ - rm -rf "$tmpdir" -} - -trap cleanup 0 - -cd "${tmpdir}" -wget -q "${linaro_repo}/${tensorflow_package}" - # We're only using the TensorFlow wheel snapshot here as the # h5py wheel tries to use the wrong .so file pip3 install \ - ${tensorflow_package} \ - "h5py==3.1.0" \ - keras==2.6 \ - "protobuf<4" + keras==2.9 \ + tensorflow-aarch64==2.9.1 diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh index 8a394302fdd3..4b73c202bc7f 100755 --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -18,6 +18,7 @@ set -e set -u +set -x set -o pipefail # The tflite version should have matched versions to the tensorflow @@ -38,8 +39,16 @@ pip3 install flatbuffers # The library is built at: # tensorflow/tensorflow/lite/tools/make/gen/*/lib/libtensorflow-lite.a. git clone https://github.com/tensorflow/tensorflow --branch=v${TENSORFLOW_VERSION} --depth 1 -./tensorflow/tensorflow/lite/tools/make/download_dependencies.sh -./tensorflow/tensorflow/lite/tools/make/build_lib.sh + +mkdir -p /opt/tflite +cd /opt/tflite +cmake \ + -DTFLITE_ENABLE_XNNPACK=OFF \ + /tensorflow/tensorflow/lite + +cmake --build . 
+cd - + # Setup tflite from schema mkdir tflite diff --git a/docker/install/ubuntu_install_vela.sh b/docker/install/ubuntu_install_vela.sh index c72d11823345..9e32889cd4eb 100755 --- a/docker/install/ubuntu_install_vela.sh +++ b/docker/install/ubuntu_install_vela.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip3 install ethos-u-vela==3.2.0 +pip3 install ethos-u-vela==3.4.0 diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index d25027f00709..f955a7ff9b19 100755 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -31,7 +31,8 @@ sudo apt-install-and-clear -y --no-install-recommends \ git cmake ninja-build gperf \ ccache dfu-util device-tree-compiler wget \ python3-dev python3-pip python3-setuptools python3-tk python3-wheel python3-venv \ - xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https + xz-utils file make gcc gcc-multilib g++-multilib apt-transport-https libudev-dev \ + libmagic1 wget --no-verbose https://apt.kitware.com/keys/kitware-archive-latest.asc sudo apt-key add kitware-archive-latest.asc From 7f788dca4ecc76203b3a1873154106d4127c4f98 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Sep 2022 13:15:42 -0700 Subject: [PATCH 121/704] [ci] Add onnx model to S3 (#12716) Addresses this CI failure on `main`: https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4235/pipeline/ Co-authored-by: driazati --- .github/workflows/upload_ci_resource.yml | 2 ++ tests/scripts/request_hook/request_hook.py | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/upload_ci_resource.yml b/.github/workflows/upload_ci_resource.yml index 10bba56583c9..6d85c26c25b3 100644 --- a/.github/workflows/upload_ci_resource.yml +++ b/.github/workflows/upload_ci_resource.yml @@ -56,3 +56,5 @@ jobs: echo "$SHA256 downloaded_file" | sha256sum --check aws s3 cp downloaded_file "s3://tvm-ci-resources/$UPLOAD_PATH" 
echo "The item is available at https://tvm-ci-resources.s3.us-west-2.amazonaws.com/$UPLOAD_PATH" + echo "Add this line to tests/scripts/request_hook/request_hook.py" + echo " \"$URL\": f\"{BASE}/$UPLOAD_PATH\", diff --git a/tests/scripts/request_hook/request_hook.py b/tests/scripts/request_hook/request_hook.py index 1cabdba76b02..46448f0a38a8 100644 --- a/tests/scripts/request_hook/request_hook.py +++ b/tests/scripts/request_hook/request_hook.py @@ -40,6 +40,7 @@ "http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel": f"{BASE}/bvlc_alexnet.caffemodel", "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel": f"{BASE}/bvlc_googlenet.caffemodel", "https://github.com/dmlc/web-data/blob/main/darknet/data/dog.jpg": f"{BASE}/dog.jpg", + "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/vision/classification/mnist/model/mnist-1.onnx": f"{BASE}/onnx/mnist-1.onnx", } From 546a7da2febe8ced256a4e9759413a9542c68d66 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Wed, 7 Sep 2022 13:17:12 -0700 Subject: [PATCH 122/704] [ci] Re-balance shards (#12473) Replace '> >' in templates with >>, NFC (#12615) The problem with greedy lexing of >> as an operator was solved in C++11, and now templates no longer require spaces between >'s. 
Co-authored-by: Krzysztof Parzyszek --- Jenkinsfile | 1397 +++++++------------ ci/jenkins/Test.groovy.j2 | 12 +- ci/jenkins/generate.py | 23 +- python/tvm/contrib/hexagon/pytest_plugin.py | 3 +- tests/scripts/setup-pytest-env.sh | 10 +- 5 files changed, 567 insertions(+), 878 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2b73508da0d3..78071fde4599 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-08-30T15:26:50.100067 +// Generated at 2022-09-01T11:52:42.195970 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -1538,7 +1538,7 @@ def shard_run_unittest_GPU_3_of_3() { } -def shard_run_integration_CPU_1_of_10() { +def shard_run_integration_CPU_1_of_4() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { @@ -1549,7 +1549,7 @@ def shard_run_integration_CPU_1_of_10() { withEnv([ 'PLATFORM=cpu', 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', + 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -1610,11 +1610,11 @@ def shard_run_integration_CPU_1_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 1 of 10') + Utils.markStageSkippedForConditional('integration: CPU 1 of 4') } } -def shard_run_integration_CPU_2_of_10() { +def shard_run_integration_CPU_2_of_4() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { @@ -1625,7 +1625,7 @@ def shard_run_integration_CPU_2_of_10() { withEnv([ 'PLATFORM=cpu', 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', + 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=1', 
"SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -1686,11 +1686,11 @@ def shard_run_integration_CPU_2_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 2 of 10') + Utils.markStageSkippedForConditional('integration: CPU 2 of 4') } } -def shard_run_integration_CPU_3_of_10() { +def shard_run_integration_CPU_3_of_4() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { @@ -1701,7 +1701,7 @@ def shard_run_integration_CPU_3_of_10() { withEnv([ 'PLATFORM=cpu', 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', + 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -1762,11 +1762,11 @@ def shard_run_integration_CPU_3_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 3 of 10') + Utils.markStageSkippedForConditional('integration: CPU 3 of 4') } } -def shard_run_integration_CPU_4_of_10() { +def shard_run_integration_CPU_4_of_4() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { @@ -1777,7 +1777,7 @@ def shard_run_integration_CPU_4_of_10() { withEnv([ 'PLATFORM=cpu', 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', + 'TVM_NUM_SHARDS=4', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -1838,327 +1838,24 @@ def shard_run_integration_CPU_4_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 4 of 10') + Utils.markStageSkippedForConditional('integration: CPU 4 of 4') } } -def shard_run_integration_CPU_5_of_10() { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - try { - docker_init(ci_cpu) - init_git() - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=4', - 
"SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: """ - set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so - md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so - md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake - md5sum build/config.cmake - """, - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - } finally { - sh( - script: """ - set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive - """, - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } - } - } - } else { - Utils.markStageSkippedForConditional('integration: CPU 5 of 10') - } -} -def shard_run_integration_CPU_6_of_10() { +def shard_run_python_i386_1_of_3() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - try { - docker_init(ci_cpu) - init_git() - timeout(time: max_time, unit: 'MINUTES') { - 
withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=5', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: """ - set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so - md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so - md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake - md5sum build/config.cmake - """, - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - } finally { - sh( - script: """ - set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive - """, - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } - } - } - } else { - Utils.markStageSkippedForConditional('integration: CPU 6 of 10') - } -} - -def shard_run_integration_CPU_7_of_10() { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - try { - 
docker_init(ci_cpu) - init_git() - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=6', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: """ - set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so - md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so - md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake - md5sum build/config.cmake - """, - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - } finally { - sh( - script: """ - set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive - """, - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } - } - } - } else { - Utils.markStageSkippedForConditional('integration: CPU 7 of 10') - } -} - -def shard_run_integration_CPU_8_of_10() { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - try { - docker_init(ci_cpu) - init_git() - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=7', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: """ - set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so - md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so - md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake - md5sum build/config.cmake - """, - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - } finally { - sh( - script: """ - set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive - """, - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } - } - } - } else { - Utils.markStageSkippedForConditional('integration: CPU 8 of 10') - } -} - -def 
shard_run_integration_CPU_9_of_10() { - if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { - docker_init(ci_cpu) + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=8', + 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2183,24 +1880,24 @@ def shard_run_integration_CPU_9_of_10() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_cpu) + ci_setup(ci_i386) + cpp_unittest(ci_i386) + python_unittest(ci_i386) sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', + script: "${docker_run} ${ci_i386} 
./tests/scripts/task_python_integration_i386only.sh", + label: 'Run i386 integration tests', ) }) } @@ -2208,7 +1905,7 @@ def shard_run_integration_CPU_9_of_10() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive """, label: 'Upload JUnits to S3', ) @@ -2218,23 +1915,23 @@ def shard_run_integration_CPU_9_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 9 of 10') + Utils.markStageSkippedForConditional('python: i386 1 of 3') } } -def shard_run_integration_CPU_10_of_10() { +def shard_run_python_i386_2_of_3() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { - docker_init(ci_cpu) + docker_init(ci_i386) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=10', - 'TVM_SHARD_INDEX=9', + 'PLATFORM=i386', + 'TEST_STEP_NAME=python: i386', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2259,32 +1956,32 @@ def shard_run_integration_CPU_10_of_10() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so - md5sum build/libvta_tsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress 
s3://${s3_prefix}/cpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_cpu) + ci_setup(ci_i386) + python_unittest(ci_i386) sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', + script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", + label: 'Run i386 integration tests', ) + fsim_test(ci_i386) }) } } finally { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_CPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive """, label: 'Upload JUnits to S3', ) @@ -2294,12 +1991,11 @@ def shard_run_integration_CPU_10_of_10() { } } } else { - Utils.markStageSkippedForConditional('integration: CPU 10 of 10') + Utils.markStageSkippedForConditional('python: i386 2 of 3') } } - -def shard_run_python_i386_1_of_5() { +def shard_run_python_i386_3_of_3() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { @@ -2310,8 +2006,8 @@ def shard_run_python_i386_1_of_5() { withEnv([ 'PLATFORM=i386', 'TEST_STEP_NAME=python: i386', - 'TVM_NUM_SHARDS=5', - 'TVM_SHARD_INDEX=0', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2349,7 +2045,6 @@ def shard_run_python_i386_1_of_5() { ) ci_setup(ci_i386) - cpp_unittest(ci_i386) python_unittest(ci_i386) sh ( script: "${docker_run} ${ci_i386} 
./tests/scripts/task_python_integration_i386only.sh", @@ -2371,23 +2066,24 @@ def shard_run_python_i386_1_of_5() { } } } else { - Utils.markStageSkippedForConditional('python: i386 1 of 5') + Utils.markStageSkippedForConditional('python: i386 3 of 3') } } -def shard_run_python_i386_2_of_5() { + +def shard_run_test_Hexagon_1_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_i386) + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=i386', - 'TEST_STEP_NAME=python: i386', - 'TVM_NUM_SHARDS=5', - 'TVM_SHARD_INDEX=1', + 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2412,32 +2108,31 @@ def shard_run_python_i386_2_of_5() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_i386) - python_unittest(ci_i386) + 
add_hexagon_permissions() + ci_setup(ci_hexagon) + cpp_unittest(ci_hexagon) sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", - label: 'Run i386 integration tests', + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", + label: 'Run Hexagon tests', ) - fsim_test(ci_i386) }) } } finally { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive """, label: 'Upload JUnits to S3', ) @@ -2447,23 +2142,23 @@ def shard_run_python_i386_2_of_5() { } } } else { - Utils.markStageSkippedForConditional('python: i386 2 of 5') + Utils.markStageSkippedForConditional('test: Hexagon 1 of 8') } } -def shard_run_python_i386_3_of_5() { +def shard_run_test_Hexagon_2_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_i386) + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=i386', - 'TEST_STEP_NAME=python: i386', - 'TVM_NUM_SHARDS=5', - 'TVM_SHARD_INDEX=2', + 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2488,23 +2183,22 @@ def shard_run_python_i386_3_of_5() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + 
retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_i386) - python_unittest(ci_i386) + add_hexagon_permissions() + ci_setup(ci_hexagon) sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", - label: 'Run i386 integration tests', + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", + label: 'Run Hexagon tests', ) }) } @@ -2512,7 +2206,7 @@ def shard_run_python_i386_3_of_5() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive """, label: 'Upload JUnits to S3', ) @@ -2522,23 +2216,23 @@ def shard_run_python_i386_3_of_5() { } } } else { - Utils.markStageSkippedForConditional('python: i386 3 of 5') + Utils.markStageSkippedForConditional('test: Hexagon 2 of 8') } } -def shard_run_python_i386_4_of_5() { +def shard_run_test_Hexagon_3_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_i386) + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=i386', - 'TEST_STEP_NAME=python: i386', - 'TVM_NUM_SHARDS=5', - 'TVM_SHARD_INDEX=3', + 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', + 'TVM_NUM_SHARDS=8', + 
'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2563,23 +2257,22 @@ def shard_run_python_i386_4_of_5() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_i386) - python_unittest(ci_i386) + add_hexagon_permissions() + ci_setup(ci_hexagon) sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", - label: 'Run i386 integration tests', + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", + label: 'Run Hexagon tests', ) }) } @@ -2587,7 +2280,7 @@ def shard_run_python_i386_4_of_5() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive """, label: 'Upload JUnits to S3', ) @@ -2597,23 +2290,23 @@ def shard_run_python_i386_4_of_5() { } } } else { - Utils.markStageSkippedForConditional('python: i386 4 of 5') + Utils.markStageSkippedForConditional('test: Hexagon 3 of 8') } } -def 
shard_run_python_i386_5_of_5() { +def shard_run_test_Hexagon_4_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_i386) + docker_init(ci_hexagon) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=i386', - 'TEST_STEP_NAME=python: i386', - 'TVM_NUM_SHARDS=5', - 'TVM_SHARD_INDEX=4', + 'PLATFORM=hexagon', + 'TEST_STEP_NAME=test: Hexagon', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2638,23 +2331,22 @@ def shard_run_python_i386_5_of_5() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_i386) - python_unittest(ci_i386) + add_hexagon_permissions() + ci_setup(ci_hexagon) sh ( - script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", - label: 'Run i386 integration tests', + script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", + 
label: 'Run Hexagon tests', ) }) } @@ -2662,7 +2354,7 @@ def shard_run_python_i386_5_of_5() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/python_i386 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive """, label: 'Upload JUnits to S3', ) @@ -2672,12 +2364,11 @@ def shard_run_python_i386_5_of_5() { } } } else { - Utils.markStageSkippedForConditional('python: i386 5 of 5') + Utils.markStageSkippedForConditional('test: Hexagon 4 of 8') } } - -def shard_run_test_Hexagon_1_of_7() { +def shard_run_test_Hexagon_5_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { @@ -2688,8 +2379,8 @@ def shard_run_test_Hexagon_1_of_7() { withEnv([ 'PLATFORM=hexagon', 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=0', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2727,7 +2418,6 @@ def shard_run_test_Hexagon_1_of_7() { add_hexagon_permissions() ci_setup(ci_hexagon) - cpp_unittest(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", label: 'Run Hexagon tests', @@ -2748,11 +2438,11 @@ def shard_run_test_Hexagon_1_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 1 of 7') + Utils.markStageSkippedForConditional('test: Hexagon 5 of 8') } } -def shard_run_test_Hexagon_2_of_7() { +def shard_run_test_Hexagon_6_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { @@ -2763,8 +2453,8 @@ def shard_run_test_Hexagon_2_of_7() { withEnv([ 'PLATFORM=hexagon', 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=1', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2822,11 +2512,11 @@ def 
shard_run_test_Hexagon_2_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 2 of 7') + Utils.markStageSkippedForConditional('test: Hexagon 6 of 8') } } -def shard_run_test_Hexagon_3_of_7() { +def shard_run_test_Hexagon_7_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { @@ -2837,8 +2527,8 @@ def shard_run_test_Hexagon_3_of_7() { withEnv([ 'PLATFORM=hexagon', 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=2', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=6', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2896,11 +2586,11 @@ def shard_run_test_Hexagon_3_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 3 of 7') + Utils.markStageSkippedForConditional('test: Hexagon 7 of 8') } } -def shard_run_test_Hexagon_4_of_7() { +def shard_run_test_Hexagon_8_of_8() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { @@ -2911,8 +2601,8 @@ def shard_run_test_Hexagon_4_of_7() { withEnv([ 'PLATFORM=hexagon', 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=3', + 'TVM_NUM_SHARDS=8', + 'TVM_SHARD_INDEX=7', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -2970,23 +2660,24 @@ def shard_run_test_Hexagon_4_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 4 of 7') + Utils.markStageSkippedForConditional('test: Hexagon 8 of 8') } } -def shard_run_test_Hexagon_5_of_7() { + +def shard_run_integration_aarch64_1_of_4() { if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_hexagon) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=hexagon', - 
'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=4', + 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', + 'TVM_NUM_SHARDS=4', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3011,22 +2702,23 @@ def shard_run_test_Hexagon_5_of_7() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - add_hexagon_permissions() - ci_setup(ci_hexagon) + ci_setup(ci_arm) + python_unittest(ci_arm) sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", + label: 'Run CPU integration tests', ) }) } @@ -3034,7 +2726,7 @@ def shard_run_test_Hexagon_5_of_7() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -3044,23 +2736,23 @@ def shard_run_test_Hexagon_5_of_7() 
{ } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 5 of 7') + Utils.markStageSkippedForConditional('integration: aarch64 1 of 4') } } -def shard_run_test_Hexagon_6_of_7() { +def shard_run_integration_aarch64_2_of_4() { if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_hexagon) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=5', + 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', + 'TVM_NUM_SHARDS=4', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3085,22 +2777,23 @@ def shard_run_test_Hexagon_6_of_7() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - add_hexagon_permissions() - ci_setup(ci_hexagon) + ci_setup(ci_arm) + python_unittest(ci_arm) sh ( - script: "${docker_run} 
${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", + label: 'Run CPU integration tests', ) }) } @@ -3108,7 +2801,7 @@ def shard_run_test_Hexagon_6_of_7() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -3118,23 +2811,23 @@ def shard_run_test_Hexagon_6_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 6 of 7') + Utils.markStageSkippedForConditional('integration: aarch64 2 of 4') } } -def shard_run_test_Hexagon_7_of_7() { +def shard_run_integration_aarch64_3_of_4() { if (!skip_ci && is_docs_only_build != 1) { - node('CPU-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_hexagon) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=7', - 'TVM_SHARD_INDEX=6', + 'PLATFORM=arm', + 'TEST_STEP_NAME=integration: aarch64', + 'TVM_NUM_SHARDS=4', + 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3159,22 +2852,23 @@ def shard_run_test_Hexagon_7_of_7() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + md5sum build/libvta_fsim.so + retry 3 aws s3 cp --no-progress 
s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/hexagon_api_output build/hexagon_api_output --recursive """, label: 'Download artifacts from S3', ) - add_hexagon_permissions() - ci_setup(ci_hexagon) + ci_setup(ci_arm) + python_unittest(ci_arm) sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", + label: 'Run CPU integration tests', ) }) } @@ -3182,7 +2876,7 @@ def shard_run_test_Hexagon_7_of_7() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Hexagon --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -3192,12 +2886,11 @@ def shard_run_test_Hexagon_7_of_7() { } } } else { - Utils.markStageSkippedForConditional('test: Hexagon 7 of 7') + Utils.markStageSkippedForConditional('integration: aarch64 3 of 4') } } - -def shard_run_integration_aarch64_1_of_4() { +def shard_run_integration_aarch64_4_of_4() { if (!skip_ci && is_docs_only_build != 1) { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { @@ -3209,7 +2902,7 @@ def shard_run_integration_aarch64_1_of_4() { 'PLATFORM=arm', 'TEST_STEP_NAME=integration: aarch64', 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=0', + 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3268,23 +2961,24 @@ def shard_run_integration_aarch64_1_of_4() { } } } else { - Utils.markStageSkippedForConditional('integration: aarch64 1 of 4') + 
Utils.markStageSkippedForConditional('integration: aarch64 4 of 4') } } -def shard_run_integration_aarch64_2_of_4() { + +def shard_run_topi_GPU_1_of_3() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { + node('GPU') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_arm) + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=1', + 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3309,23 +3003,22 @@ def shard_run_integration_aarch64_2_of_4() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) - python_unittest(ci_arm) + ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", + 
label: 'Run TOPI tests', ) }) } @@ -3333,7 +3026,7 @@ def shard_run_integration_aarch64_2_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3343,23 +3036,23 @@ def shard_run_integration_aarch64_2_of_4() { } } } else { - Utils.markStageSkippedForConditional('integration: aarch64 2 of 4') + Utils.markStageSkippedForConditional('topi: GPU 1 of 3') } } -def shard_run_integration_aarch64_3_of_4() { +def shard_run_topi_GPU_2_of_3() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { + node('GPU') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_arm) + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=2', + 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3384,23 +3077,22 @@ def shard_run_integration_aarch64_3_of_4() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 
3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) - python_unittest(ci_arm) + ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', ) }) } @@ -3408,7 +3100,7 @@ def shard_run_integration_aarch64_3_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3418,23 +3110,23 @@ def shard_run_integration_aarch64_3_of_4() { } } } else { - Utils.markStageSkippedForConditional('integration: aarch64 3 of 4') + Utils.markStageSkippedForConditional('topi: GPU 2 of 3') } } -def shard_run_integration_aarch64_4_of_4() { +def shard_run_topi_GPU_3_of_3() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { + node('GPU') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_arm) + docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=3', + 'PLATFORM=gpu', + 'TEST_STEP_NAME=topi: GPU', + 'TVM_NUM_SHARDS=3', + 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3459,23 +3151,22 @@ def shard_run_integration_aarch64_4_of_4() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress 
s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) - python_unittest(ci_arm) + ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', ) }) } @@ -3483,7 +3174,7 @@ def shard_run_integration_aarch64_4_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/integration_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3493,23 +3184,23 @@ def shard_run_integration_aarch64_4_of_4() { } } } else { - Utils.markStageSkippedForConditional('integration: aarch64 4 of 4') + Utils.markStageSkippedForConditional('topi: GPU 3 of 3') } } -def shard_run_topi_GPU_1_of_4() { +def shard_run_frontend_GPU_1_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { 
withEnv([ 'PLATFORM=gpu', - 'TEST_STEP_NAME=topi: GPU', - 'TVM_NUM_SHARDS=4', + 'TEST_STEP_NAME=frontend: GPU', + 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -3549,8 +3240,8 @@ def shard_run_topi_GPU_1_of_4() { ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + label: 'Run Python frontend tests', ) }) } @@ -3558,7 +3249,7 @@ def shard_run_topi_GPU_1_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3568,22 +3259,22 @@ def shard_run_topi_GPU_1_of_4() { } } } else { - Utils.markStageSkippedForConditional('topi: GPU 1 of 4') + Utils.markStageSkippedForConditional('frontend: GPU 1 of 6') } } -def shard_run_topi_GPU_2_of_4() { +def shard_run_frontend_GPU_2_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', - 'TEST_STEP_NAME=topi: GPU', - 'TVM_NUM_SHARDS=4', + 'TEST_STEP_NAME=frontend: GPU', + 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -3623,8 +3314,8 @@ def shard_run_topi_GPU_2_of_4() { ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + label: 'Run Python frontend tests', ) }) } @@ -3632,7 +3323,7 @@ def shard_run_topi_GPU_2_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results 
s3://${s3_prefix}/pytest-results/topi_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3642,22 +3333,22 @@ def shard_run_topi_GPU_2_of_4() { } } } else { - Utils.markStageSkippedForConditional('topi: GPU 2 of 4') + Utils.markStageSkippedForConditional('frontend: GPU 2 of 6') } } -def shard_run_topi_GPU_3_of_4() { +def shard_run_frontend_GPU_3_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', - 'TEST_STEP_NAME=topi: GPU', - 'TVM_NUM_SHARDS=4', + 'TEST_STEP_NAME=frontend: GPU', + 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -3697,8 +3388,8 @@ def shard_run_topi_GPU_3_of_4() { ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + label: 'Run Python frontend tests', ) }) } @@ -3706,7 +3397,7 @@ def shard_run_topi_GPU_3_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3716,22 +3407,22 @@ def shard_run_topi_GPU_3_of_4() { } } } else { - Utils.markStageSkippedForConditional('topi: GPU 3 of 4') + Utils.markStageSkippedForConditional('frontend: GPU 3 of 6') } } -def shard_run_topi_GPU_4_of_4() { +def shard_run_frontend_GPU_4_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { + 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { docker_init(ci_gpu) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', - 'TEST_STEP_NAME=topi: GPU', - 'TVM_NUM_SHARDS=4', + 'TEST_STEP_NAME=frontend: GPU', + 'TVM_NUM_SHARDS=6', 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -3771,8 +3462,8 @@ def shard_run_topi_GPU_4_of_4() { ci_setup(ci_gpu) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + label: 'Run Python frontend tests', ) }) } @@ -3780,7 +3471,7 @@ def shard_run_topi_GPU_4_of_4() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive """, label: 'Upload JUnits to S3', ) @@ -3790,12 +3481,11 @@ def shard_run_topi_GPU_4_of_4() { } } } else { - Utils.markStageSkippedForConditional('topi: GPU 4 of 4') + Utils.markStageSkippedForConditional('frontend: GPU 4 of 6') } } - -def shard_run_frontend_GPU_1_of_6() { +def shard_run_frontend_GPU_5_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { @@ -3807,7 +3497,7 @@ def shard_run_frontend_GPU_1_of_6() { 'PLATFORM=gpu', 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=0', + 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3865,11 +3555,11 @@ def shard_run_frontend_GPU_1_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 1 of 6') + Utils.markStageSkippedForConditional('frontend: GPU 5 of 6') } } -def shard_run_frontend_GPU_2_of_6() { +def shard_run_frontend_GPU_6_of_6() { if (!skip_ci && is_docs_only_build != 1) { node('GPU') { 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { @@ -3881,7 +3571,7 @@ def shard_run_frontend_GPU_2_of_6() { 'PLATFORM=gpu', 'TEST_STEP_NAME=frontend: GPU', 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=1', + 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3939,23 +3629,24 @@ def shard_run_frontend_GPU_2_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 2 of 6') + Utils.markStageSkippedForConditional('frontend: GPU 6 of 6') } } -def shard_run_frontend_GPU_3_of_6() { + +def shard_run_topi_aarch64_1_of_2() { if (!skip_ci && is_docs_only_build != 1) { - node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_gpu) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=gpu', - 'TEST_STEP_NAME=frontend: GPU', - 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=2', + 'PLATFORM=arm', + 'TEST_STEP_NAME=topi: aarch64', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -3980,22 +3671,27 @@ def shard_run_frontend_GPU_3_of_6() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + 
retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_gpu) + ci_setup(ci_arm) + cpp_unittest(ci_arm) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", - label: 'Run Python frontend tests', + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", + label: 'Run test_arm_compute_lib test', + ) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', ) }) } @@ -4003,7 +3699,7 @@ def shard_run_frontend_GPU_3_of_6() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -4013,23 +3709,23 @@ def shard_run_frontend_GPU_3_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 3 of 6') + Utils.markStageSkippedForConditional('topi: aarch64 1 of 2') } } -def shard_run_frontend_GPU_4_of_6() { +def shard_run_topi_aarch64_2_of_2() { if (!skip_ci && is_docs_only_build != 1) { - node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_gpu) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=gpu', - 'TEST_STEP_NAME=frontend: GPU', - 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=3', + 'PLATFORM=arm', + 'TEST_STEP_NAME=topi: aarch64', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4054,22 +3750,26 @@ def shard_run_frontend_GPU_4_of_6() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress 
s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_gpu) + ci_setup(ci_arm) + sh ( + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", + label: 'Run test_arm_compute_lib test', + ) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", - label: 'Run Python frontend tests', + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", + label: 'Run TOPI tests', ) }) } @@ -4077,7 +3777,7 @@ def shard_run_frontend_GPU_4_of_6() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -4087,23 +3787,24 @@ def shard_run_frontend_GPU_4_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 4 of 6') + Utils.markStageSkippedForConditional('topi: aarch64 2 of 2') } } -def shard_run_frontend_GPU_5_of_6() { + +def shard_run_frontend_aarch64_1_of_2() { if (!skip_ci && is_docs_only_build != 1) { - node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { + node('ARM-SMALL') { + 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { - docker_init(ci_gpu) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=gpu', - 'TEST_STEP_NAME=frontend: GPU', - 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=4', + 'PLATFORM=arm', + 'TEST_STEP_NAME=frontend: aarch64', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4128,21 +3829,21 @@ def shard_run_frontend_GPU_5_of_6() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_gpu) + ci_setup(ci_arm) sh ( - script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh", + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", label: 'Run Python frontend tests', ) }) @@ -4151,7 +3852,7 @@ def shard_run_frontend_GPU_5_of_6() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive """, label: 'Upload JUnits to 
S3', ) @@ -4161,23 +3862,23 @@ def shard_run_frontend_GPU_5_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 5 of 6') + Utils.markStageSkippedForConditional('frontend: aarch64 1 of 2') } } -def shard_run_frontend_GPU_6_of_6() { +def shard_run_frontend_aarch64_2_of_2() { if (!skip_ci && is_docs_only_build != 1) { - node('GPU') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { + node('ARM-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { - docker_init(ci_gpu) + docker_init(ci_arm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=gpu', - 'TEST_STEP_NAME=frontend: GPU', - 'TVM_NUM_SHARDS=6', - 'TVM_SHARD_INDEX=5', + 'PLATFORM=arm', + 'TEST_STEP_NAME=frontend: aarch64', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4202,21 +3903,21 @@ def shard_run_frontend_GPU_6_of_6() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake md5sum build/config.cmake """, label: 'Download artifacts from S3', ) - ci_setup(ci_gpu) + ci_setup(ci_arm) sh ( - script: "${docker_run} ${ci_gpu} 
./tests/scripts/task_python_frontend.sh", + script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", label: 'Run Python frontend tests', ) }) @@ -4225,7 +3926,7 @@ def shard_run_frontend_GPU_6_of_6() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_GPU --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive """, label: 'Upload JUnits to S3', ) @@ -4235,23 +3936,23 @@ def shard_run_frontend_GPU_6_of_6() { } } } else { - Utils.markStageSkippedForConditional('frontend: GPU 6 of 6') + Utils.markStageSkippedForConditional('frontend: aarch64 2 of 2') } } -def shard_run_topi_aarch64_1_of_2() { +def shard_run_test_Cortex_M_1_of_12() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_arm) + docker_init(ci_cortexm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=topi: aarch64', - 'TVM_NUM_SHARDS=2', + 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', + 'TVM_NUM_SHARDS=12', 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -4277,27 +3978,27 @@ def shard_run_topi_aarch64_1_of_2() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 
aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) - cpp_unittest(ci_arm) + add_microtvm_permissions() + ci_setup(ci_cortexm) + cpp_unittest(ci_cortexm) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", - label: 'Run test_arm_compute_lib test', + script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh", + label: 'Run microTVM demos', ) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh", + label: 'Run microTVM tests', ) }) } @@ -4305,7 +4006,7 @@ def shard_run_topi_aarch64_1_of_2() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive """, label: 'Upload JUnits to S3', ) @@ -4315,22 +4016,22 @@ def shard_run_topi_aarch64_1_of_2() { } } } else { - Utils.markStageSkippedForConditional('topi: aarch64 1 of 2') + Utils.markStageSkippedForConditional('test: Cortex-M 1 of 12') } } -def shard_run_topi_aarch64_2_of_2() { +def shard_run_test_Cortex_M_2_of_12() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_arm) + docker_init(ci_cortexm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=topi: aarch64', - 
'TVM_NUM_SHARDS=2', + 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', + 'TVM_NUM_SHARDS=12', 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( @@ -4356,26 +4057,22 @@ def shard_run_topi_aarch64_2_of_2() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", - label: 'Run test_arm_compute_lib test', - ) + add_microtvm_permissions() + ci_setup(ci_cortexm) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh", - label: 'Run TOPI tests', + script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh", + label: 'Run microTVM tests', ) }) } @@ -4383,7 +4080,7 @@ def shard_run_topi_aarch64_2_of_2() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/topi_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive """, label: 'Upload JUnits to S3', ) @@ -4393,24 +4090,23 
@@ def shard_run_topi_aarch64_2_of_2() { } } } else { - Utils.markStageSkippedForConditional('topi: aarch64 2 of 2') + Utils.markStageSkippedForConditional('test: Cortex-M 2 of 12') } } - -def shard_run_frontend_aarch64_1_of_2() { +def shard_run_test_Cortex_M_3_of_12() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_arm) + docker_init(ci_cortexm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=frontend: aarch64', - 'TVM_NUM_SHARDS=2', - 'TVM_SHARD_INDEX=0', + 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=2', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4435,22 +4131,22 @@ def shard_run_frontend_aarch64_1_of_2() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) + add_microtvm_permissions() + ci_setup(ci_cortexm) sh ( - 
script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", - label: 'Run Python frontend tests', + script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh", + label: 'Run microTVM tests', ) }) } @@ -4458,7 +4154,7 @@ def shard_run_frontend_aarch64_1_of_2() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive """, label: 'Upload JUnits to S3', ) @@ -4468,23 +4164,23 @@ def shard_run_frontend_aarch64_1_of_2() { } } } else { - Utils.markStageSkippedForConditional('frontend: aarch64 1 of 2') + Utils.markStageSkippedForConditional('test: Cortex-M 3 of 12') } } -def shard_run_frontend_aarch64_2_of_2() { +def shard_run_test_Cortex_M_4_of_12() { if (!skip_ci && is_docs_only_build != 1) { - node('ARM-SMALL') { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { + node('CPU-SMALL') { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_arm) + docker_init(ci_cortexm) init_git() timeout(time: max_time, unit: 'MINUTES') { withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=frontend: aarch64', - 'TVM_NUM_SHARDS=2', - 'TVM_SHARD_INDEX=1', + 'PLATFORM=cortexm', + 'TEST_STEP_NAME=test: Cortex-M', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=3', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4509,22 +4205,22 @@ def shard_run_frontend_aarch64_2_of_2() { done } - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so - md5sum build/libvta_fsim.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm_runtime.so build/libtvm_runtime.so + retry 3 
aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so md5sum build/libtvm_runtime.so - retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/config.cmake build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/config.cmake build/config.cmake md5sum build/config.cmake + retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/microtvm_template_projects build/microtvm_template_projects --recursive """, label: 'Download artifacts from S3', ) - ci_setup(ci_arm) + add_microtvm_permissions() + ci_setup(ci_cortexm) sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_frontend_cpu.sh", - label: 'Run Python frontend tests', + script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh", + label: 'Run microTVM tests', ) }) } @@ -4532,7 +4228,7 @@ def shard_run_frontend_aarch64_2_of_2() { sh( script: """ set -eux - aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/frontend_aarch64 --recursive + aws s3 cp --no-progress build/pytest-results s3://${s3_prefix}/pytest-results/test_Cortex_M --recursive """, label: 'Upload JUnits to S3', ) @@ -4542,12 +4238,11 @@ def shard_run_frontend_aarch64_2_of_2() { } } } else { - Utils.markStageSkippedForConditional('frontend: aarch64 2 of 2') + Utils.markStageSkippedForConditional('test: Cortex-M 4 of 12') } } - -def shard_run_test_Cortex_M_1_of_8() { +def shard_run_test_Cortex_M_5_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4558,8 +4253,8 @@ def shard_run_test_Cortex_M_1_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=0', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=4', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4597,11 +4292,6 @@ def shard_run_test_Cortex_M_1_of_8() { add_microtvm_permissions() ci_setup(ci_cortexm) - 
cpp_unittest(ci_cortexm) - sh ( - script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_demo_microtvm.sh", - label: 'Run microTVM demos', - ) sh ( script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_python_microtvm.sh", label: 'Run microTVM tests', @@ -4622,11 +4312,11 @@ def shard_run_test_Cortex_M_1_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 1 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 5 of 12') } } -def shard_run_test_Cortex_M_2_of_8() { +def shard_run_test_Cortex_M_6_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4637,8 +4327,8 @@ def shard_run_test_Cortex_M_2_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=1', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=5', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4696,11 +4386,11 @@ def shard_run_test_Cortex_M_2_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 2 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 6 of 12') } } -def shard_run_test_Cortex_M_3_of_8() { +def shard_run_test_Cortex_M_7_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4711,8 +4401,8 @@ def shard_run_test_Cortex_M_3_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=2', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=6', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4770,11 +4460,11 @@ def shard_run_test_Cortex_M_3_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 3 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 7 of 12') } } -def shard_run_test_Cortex_M_4_of_8() { +def shard_run_test_Cortex_M_8_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { 
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4785,8 +4475,8 @@ def shard_run_test_Cortex_M_4_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=3', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=7', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4844,11 +4534,11 @@ def shard_run_test_Cortex_M_4_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 4 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 8 of 12') } } -def shard_run_test_Cortex_M_5_of_8() { +def shard_run_test_Cortex_M_9_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4859,8 +4549,8 @@ def shard_run_test_Cortex_M_5_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=4', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=8', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4918,11 +4608,11 @@ def shard_run_test_Cortex_M_5_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 5 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 9 of 12') } } -def shard_run_test_Cortex_M_6_of_8() { +def shard_run_test_Cortex_M_10_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -4933,8 +4623,8 @@ def shard_run_test_Cortex_M_6_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=5', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=9', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -4992,11 +4682,11 @@ def shard_run_test_Cortex_M_6_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 6 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 10 of 12') } } -def shard_run_test_Cortex_M_7_of_8() { +def 
shard_run_test_Cortex_M_11_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -5007,8 +4697,8 @@ def shard_run_test_Cortex_M_7_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=6', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=10', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -5066,11 +4756,11 @@ def shard_run_test_Cortex_M_7_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 7 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 11 of 12') } } -def shard_run_test_Cortex_M_8_of_8() { +def shard_run_test_Cortex_M_12_of_12() { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { @@ -5081,8 +4771,8 @@ def shard_run_test_Cortex_M_8_of_8() { withEnv([ 'PLATFORM=cortexm', 'TEST_STEP_NAME=test: Cortex-M', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=7', + 'TVM_NUM_SHARDS=12', + 'TVM_SHARD_INDEX=11', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ @@ -5140,7 +4830,7 @@ def shard_run_test_Cortex_M_8_of_8() { } } } else { - Utils.markStageSkippedForConditional('test: Cortex-M 8 of 8') + Utils.markStageSkippedForConditional('test: Cortex-M 12 of 12') } } @@ -5300,71 +4990,50 @@ stage('Test') { 'unittest: GPU 3 of 3': { shard_run_unittest_GPU_3_of_3() }, - 'integration: CPU 1 of 10': { - shard_run_integration_CPU_1_of_10() - }, - 'integration: CPU 2 of 10': { - shard_run_integration_CPU_2_of_10() - }, - 'integration: CPU 3 of 10': { - shard_run_integration_CPU_3_of_10() - }, - 'integration: CPU 4 of 10': { - shard_run_integration_CPU_4_of_10() - }, - 'integration: CPU 5 of 10': { - shard_run_integration_CPU_5_of_10() - }, - 'integration: CPU 6 of 10': { - shard_run_integration_CPU_6_of_10() + 'integration: CPU 1 of 4': { + shard_run_integration_CPU_1_of_4() }, - 'integration: CPU 7 of 10': { - 
shard_run_integration_CPU_7_of_10() + 'integration: CPU 2 of 4': { + shard_run_integration_CPU_2_of_4() }, - 'integration: CPU 8 of 10': { - shard_run_integration_CPU_8_of_10() + 'integration: CPU 3 of 4': { + shard_run_integration_CPU_3_of_4() }, - 'integration: CPU 9 of 10': { - shard_run_integration_CPU_9_of_10() + 'integration: CPU 4 of 4': { + shard_run_integration_CPU_4_of_4() }, - 'integration: CPU 10 of 10': { - shard_run_integration_CPU_10_of_10() + 'python: i386 1 of 3': { + shard_run_python_i386_1_of_3() }, - 'python: i386 1 of 5': { - shard_run_python_i386_1_of_5() + 'python: i386 2 of 3': { + shard_run_python_i386_2_of_3() }, - 'python: i386 2 of 5': { - shard_run_python_i386_2_of_5() + 'python: i386 3 of 3': { + shard_run_python_i386_3_of_3() }, - 'python: i386 3 of 5': { - shard_run_python_i386_3_of_5() + 'test: Hexagon 1 of 8': { + shard_run_test_Hexagon_1_of_8() }, - 'python: i386 4 of 5': { - shard_run_python_i386_4_of_5() + 'test: Hexagon 2 of 8': { + shard_run_test_Hexagon_2_of_8() }, - 'python: i386 5 of 5': { - shard_run_python_i386_5_of_5() + 'test: Hexagon 3 of 8': { + shard_run_test_Hexagon_3_of_8() }, - 'test: Hexagon 1 of 7': { - shard_run_test_Hexagon_1_of_7() + 'test: Hexagon 4 of 8': { + shard_run_test_Hexagon_4_of_8() }, - 'test: Hexagon 2 of 7': { - shard_run_test_Hexagon_2_of_7() + 'test: Hexagon 5 of 8': { + shard_run_test_Hexagon_5_of_8() }, - 'test: Hexagon 3 of 7': { - shard_run_test_Hexagon_3_of_7() + 'test: Hexagon 6 of 8': { + shard_run_test_Hexagon_6_of_8() }, - 'test: Hexagon 4 of 7': { - shard_run_test_Hexagon_4_of_7() + 'test: Hexagon 7 of 8': { + shard_run_test_Hexagon_7_of_8() }, - 'test: Hexagon 5 of 7': { - shard_run_test_Hexagon_5_of_7() - }, - 'test: Hexagon 6 of 7': { - shard_run_test_Hexagon_6_of_7() - }, - 'test: Hexagon 7 of 7': { - shard_run_test_Hexagon_7_of_7() + 'test: Hexagon 8 of 8': { + shard_run_test_Hexagon_8_of_8() }, 'integration: aarch64 1 of 4': { shard_run_integration_aarch64_1_of_4() @@ -5378,17 
+5047,14 @@ stage('Test') { 'integration: aarch64 4 of 4': { shard_run_integration_aarch64_4_of_4() }, - 'topi: GPU 1 of 4': { - shard_run_topi_GPU_1_of_4() + 'topi: GPU 1 of 3': { + shard_run_topi_GPU_1_of_3() }, - 'topi: GPU 2 of 4': { - shard_run_topi_GPU_2_of_4() + 'topi: GPU 2 of 3': { + shard_run_topi_GPU_2_of_3() }, - 'topi: GPU 3 of 4': { - shard_run_topi_GPU_3_of_4() - }, - 'topi: GPU 4 of 4': { - shard_run_topi_GPU_4_of_4() + 'topi: GPU 3 of 3': { + shard_run_topi_GPU_3_of_3() }, 'frontend: GPU 1 of 6': { shard_run_frontend_GPU_1_of_6() @@ -5420,29 +5086,41 @@ stage('Test') { 'frontend: aarch64 2 of 2': { shard_run_frontend_aarch64_2_of_2() }, - 'test: Cortex-M 1 of 8': { - shard_run_test_Cortex_M_1_of_8() + 'test: Cortex-M 1 of 12': { + shard_run_test_Cortex_M_1_of_12() + }, + 'test: Cortex-M 2 of 12': { + shard_run_test_Cortex_M_2_of_12() }, - 'test: Cortex-M 2 of 8': { - shard_run_test_Cortex_M_2_of_8() + 'test: Cortex-M 3 of 12': { + shard_run_test_Cortex_M_3_of_12() }, - 'test: Cortex-M 3 of 8': { - shard_run_test_Cortex_M_3_of_8() + 'test: Cortex-M 4 of 12': { + shard_run_test_Cortex_M_4_of_12() }, - 'test: Cortex-M 4 of 8': { - shard_run_test_Cortex_M_4_of_8() + 'test: Cortex-M 5 of 12': { + shard_run_test_Cortex_M_5_of_12() }, - 'test: Cortex-M 5 of 8': { - shard_run_test_Cortex_M_5_of_8() + 'test: Cortex-M 6 of 12': { + shard_run_test_Cortex_M_6_of_12() }, - 'test: Cortex-M 6 of 8': { - shard_run_test_Cortex_M_6_of_8() + 'test: Cortex-M 7 of 12': { + shard_run_test_Cortex_M_7_of_12() }, - 'test: Cortex-M 7 of 8': { - shard_run_test_Cortex_M_7_of_8() + 'test: Cortex-M 8 of 12': { + shard_run_test_Cortex_M_8_of_12() }, - 'test: Cortex-M 8 of 8': { - shard_run_test_Cortex_M_8_of_8() + 'test: Cortex-M 9 of 12': { + shard_run_test_Cortex_M_9_of_12() + }, + 'test: Cortex-M 10 of 12': { + shard_run_test_Cortex_M_10_of_12() + }, + 'test: Cortex-M 11 of 12': { + shard_run_test_Cortex_M_11_of_12() + }, + 'test: Cortex-M 12 of 12': { + 
shard_run_test_Cortex_M_12_of_12() }, 'test: RISC-V 1 of 1': { shard_run_test_RISC_V_1_of_1() @@ -5684,7 +5362,8 @@ stage('Test') { }, ) } -}/* +} +/* stage('Build packages') { parallel 'conda CPU': { node('CPU') { diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2 index 9e8c9ac28b01..4ed149da9be0 100644 --- a/ci/jenkins/Test.groovy.j2 +++ b/ci/jenkins/Test.groovy.j2 @@ -42,7 +42,7 @@ {% call(shard_index, num_shards) m.sharded_test_step( name="integration: CPU", node="CPU-SMALL", - num_shards=10, + num_shards=4, ws="tvm/integration-python-cpu", platform="cpu", docker_image="ci_cpu", @@ -58,7 +58,7 @@ {% call(shard_index, num_shards) m.sharded_test_step( name="python: i386", node="CPU-SMALL", - num_shards=5, + num_shards=3, ws="tvm/integration-python-i386", platform="i386", docker_image="ci_i386", @@ -85,7 +85,7 @@ platform="hexagon", docker_image="ci_hexagon", test_method_names=test_method_names, - num_shards=7, + num_shards=8, ) %} {{ m.download_artifacts(tag='hexagon', filenames=tvm_lib, folders=hexagon_api) }} add_hexagon_permissions() @@ -118,7 +118,7 @@ {% call(shard_index, num_shards) m.sharded_test_step( name="topi: GPU", node="GPU", - num_shards=4, + num_shards=3, ws="tvm/topi-python-gpu", platform="gpu", docker_image="ci_gpu", @@ -192,7 +192,7 @@ ws="tvm/test-cortexm", platform="cortexm", docker_image="ci_cortexm", - num_shards=8, + num_shards=12, test_method_names=test_method_names, ) %} {{ m.download_artifacts(tag='cortexm', filenames=tvm_lib, folders=microtvm_template_projects) }} @@ -316,4 +316,4 @@ stage('Test') { }, ) } -} \ No newline at end of file +} diff --git a/ci/jenkins/generate.py b/ci/jenkins/generate.py index 3ccdedc6d924..07bf4b5a8dad 100644 --- a/ci/jenkins/generate.py +++ b/ci/jenkins/generate.py @@ -31,6 +31,12 @@ JENKINSFILE = REPO_ROOT / "Jenkinsfile" +class Change: + IMAGES_ONLY = object() + NONE = object() + FULL = object() + + data = { "images": [ { @@ -83,7 +89,7 @@ def lines_without_generated_tag(content): ] -def 
is_changed_images_only(lines: List[str]) -> bool: +def change_type(lines: List[str]) -> Change: """ Return True if 'line' only edits an image tag or if 'line' is not a changed line in a diff @@ -101,7 +107,7 @@ def is_changed_images_only(lines: List[str]) -> bool: if len(diff_lines) == 0: # no changes made - return True + return Change.NONE for line in diff_lines: is_add = line.startswith("+") @@ -113,7 +119,7 @@ def is_changed_images_only(lines: List[str]) -> bool: ) if match is None: # matched a non-image line, quit early - return False + return Change.FULL if is_add: added_images.append(match.groups()[0]) @@ -121,7 +127,10 @@ def is_changed_images_only(lines: List[str]) -> bool: removed_images.append(match.groups()[0]) # make sure that the added image lines match the removed image lines - return len(added_images) > 0 and added_images == removed_images + if len(added_images) > 0 and added_images == removed_images: + return Change.IMAGES_ONLY + else: + return Change.FULL if __name__ == "__main__": @@ -156,9 +165,11 @@ def is_changed_images_only(lines: List[str]) -> bool: lines_without_generated_tag(content), lines_without_generated_tag(new_content) ) ] - if not args.force and is_changed_images_only(diff): + change = change_type(diff) + if not args.force and change == Change.IMAGES_ONLY or change == Change.NONE: + if change != Change.NONE: + print("Detected only Docker-image name changes, skipping timestamp update") new_content = new_content.replace(data["generated_time"], original_timestamp) - print("Detected only Docker-image name changed, skipping timestamp update") diff = "".join(diff) diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py index 65475d67f555..0b9f65540c34 100644 --- a/python/tvm/contrib/hexagon/pytest_plugin.py +++ b/python/tvm/contrib/hexagon/pytest_plugin.py @@ -210,7 +210,8 @@ def pytest_configure(config): def pytest_configure_node(node): # the master for each node fills slaveinput dictionary # 
which pytest-xdist will transfer to the subprocess - node.workerinput["device_adr"] = node.config.iplist.pop() + if node.config.iplist is not None: + node.workerinput["device_adr"] = node.config.iplist.pop() @pytest.fixture diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index 895979293122..305f626d666c 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -47,6 +47,7 @@ function cleanup() { trap cleanup 0 function run_pytest() { + set -e local ffi_type="$1" shift local test_suite_name="$1" @@ -74,17 +75,14 @@ function run_pytest() { suite_name="${test_suite_name}-${current_shard}-${ffi_type}" - # Some test environments don't play well with parallelism - DEFAULT_PARALLELISM=2 - if [[ "${TEST_STEP_NAME:-default}" == "frontend: GPU"* ]] || [[ "${TEST_STEP_NAME:-default}" == "test: Hexagon"* ]]; then - DEFAULT_PARALLELISM=1 - fi + DEFAULT_PARALLELISM=1 - if [ ! "${extra_args[@]}" == *" -n"* ] && [! "${extra_args[@]}" == *" -dist"* ]; then + if [[ ! "${extra_args[*]}" == *" -n"* ]] && [[ ! 
"${extra_args[*]}" == *" -dist"* ]]; then extra_args+=("-n=$DEFAULT_PARALLELISM") fi exit_code=0 + set +e TVM_FFI=${ffi_type} python3 -m pytest \ -o "junit_suite_name=${suite_name}" \ "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${suite_name}.xml" \ From abb2aa062fd240980844faa6e0ebcc2256a5191c Mon Sep 17 00:00:00 2001 From: "yin.changsheng" Date: Thu, 8 Sep 2022 11:28:36 +0800 Subject: [PATCH 123/704] [TIR] Add unroll_loop_with_partition_hint_no_interval attr in LoopPartitionConfig to unroll loop (#12631) [TIR] Add unroll_loop_with_partition_hint_no_interval attr in LoopPartitionConfig to unroll loop --- src/tir/transforms/loop_partition.cc | 28 ++++++-- .../test_tir_transform_loop_partition.py | 72 +++++++++++++------ 2 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index d410f8cfa471..e1445d29dacf 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -43,12 +43,16 @@ namespace tir { struct LoopPartitionConfigNode : public tvm::AttrsNode { bool partition_const_loop; bool no_unroll_loop_with_extent_one; + bool unroll_loop_with_partition_hint_no_interval; TVM_DECLARE_ATTRS(LoopPartitionConfigNode, "tir.transform.LoopPartitionConfig") { TVM_ATTR_FIELD(partition_const_loop).describe("Split constant loop").set_default(false); TVM_ATTR_FIELD(no_unroll_loop_with_extent_one) .describe("Don't unroll loops with extent 1") .set_default(false); + TVM_ATTR_FIELD(unroll_loop_with_partition_hint_no_interval) + .describe("Unroll loops with pragma_loop_partition_hint and no interval") + .set_default(false); } }; @@ -377,9 +381,11 @@ class ThreadPartitionInserter : public StmtMutator { // likely conditions class LoopPartitioner : public StmtMutator { public: - explicit LoopPartitioner(bool partition_const_loop, bool no_unroll_loop_with_extent_one) + explicit LoopPartitioner(bool partition_const_loop, bool no_unroll_loop_with_extent_one, + bool 
unroll_loop_with_partition_hint_no_interval) : selector(CandidateSelector(partition_const_loop)), - no_unroll_loop_with_extent_one_(no_unroll_loop_with_extent_one) {} + no_unroll_loop_with_extent_one_(no_unroll_loop_with_extent_one), + unroll_loop_with_partition_hint_no_interval_(unroll_loop_with_partition_hint_no_interval) {} Stmt VisitAndMutate(Stmt stmt) { selector(stmt); @@ -447,6 +453,7 @@ class LoopPartitioner : public StmtMutator { arith::Analyzer analyzer_; CandidateSelector selector; bool no_unroll_loop_with_extent_one_; + bool unroll_loop_with_partition_hint_no_interval_; }; // Returns an interval (in the first component) in which all the conditions @@ -587,6 +594,10 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim }(); if (!opt_cond_value.has_value()) { + if (has_partition_hint_ && unroll_loop_with_partition_hint_no_interval_ && + analyzer_.CanProve(max - min > 0)) { + return For(var, min, max - min + 1, ForKind::kUnrolled, body); + } return Stmt(); } bool cond_value = opt_cond_value.value(); @@ -658,11 +669,11 @@ Stmt LoopPartitioner::TryPartition(const Stmt& stmt, Var var, PrimExpr min, Prim Stmt simplified_body = ConditionEliminator(cond_set, cond_value)(body); Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}}); mid_stmt = MakeFor(stmt.get(), post_doubt_begin - body_begin, new_body); - + // Recurse until partitions is empty + mid_stmt = VisitAndMutate(mid_stmt); // Recurse for each non-empty subrange only if there are at least // two non-empty subranges if (pre_stmt.defined() || post_stmt.defined()) { - mid_stmt = VisitAndMutate(mid_stmt); if (pre_stmt.defined() && pre_stmt_recurse) { pre_stmt = VisitAndMutate(pre_stmt); } @@ -714,8 +725,10 @@ class RemoveLikelyTagsAndHints : public StmtExprMutator { } }; -Stmt LoopPartition(Stmt stmt, bool partition_const_loop, bool no_unroll_loop_with_extent_one) { - stmt = LoopPartitioner(partition_const_loop, no_unroll_loop_with_extent_one) +Stmt 
LoopPartition(Stmt stmt, bool partition_const_loop, bool no_unroll_loop_with_extent_one, + bool unroll_loop_with_partition_hint_no_interval) { + stmt = LoopPartitioner(partition_const_loop, no_unroll_loop_with_extent_one, + unroll_loop_with_partition_hint_no_interval) .VisitAndMutate(std::move(stmt)); stmt = RemoveLikelyTagsAndHints()(std::move(stmt)); return stmt; @@ -731,7 +744,8 @@ Pass LoopPartition() { cfg = AttrsWithDefaultValues(); } n->body = LoopPartition(std::move(n->body), cfg.value()->partition_const_loop, - cfg.value()->no_unroll_loop_with_extent_one); + cfg.value()->no_unroll_loop_with_extent_one, + cfg.value()->unroll_loop_with_partition_hint_no_interval); return f; }; return CreatePrimFuncPass(pass_func, 0, "tir.LoopPartition", {}); diff --git a/tests/python/unittest/test_tir_transform_loop_partition.py b/tests/python/unittest/test_tir_transform_loop_partition.py index b6e8d92f8d39..23a0064ee6ff 100644 --- a/tests/python/unittest/test_tir_transform_loop_partition.py +++ b/tests/python/unittest/test_tir_transform_loop_partition.py @@ -619,26 +619,54 @@ def test_condition_mutually_exclusive(): assert tvm.ir.structural_equal(mod["main"], partitioned_concat_3) +def test_loop_partition_unroll_hint(): + @T.prim_func + def main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None: + T.preflattened_buffer(A, [1, 3, 224, 224], "int8", data=A.data) + T.preflattened_buffer(B, [1, 224, 7, 16], "int8", data=B.data) + for ax0 in T.serial( + 112, + annotations={"pragma_loop_partition_hint": True}, + ): + for ax1, ax2, ax3 in T.grid(224, 7, 16): + if 3 <= ax0 * 2 + ax2 and ax0 * 2 + ax2 < 227 and ax3 < 3: + B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax0 * 2 + ax2 - 3] + + @T.prim_func + def partitioned_main(A: T.Buffer[150528, "int8"], B: T.Buffer[25088, "int8"]) -> None: + T.preflattened_buffer(A, [1, 3, 224, 224], dtype="int8", data=A.data) + T.preflattened_buffer(B, [1, 224, 7, 16], dtype="int8", data=B.data) + # body + for ax1, ax2, 
ax3 in T.grid(224, 7, 16): + if 3 <= ax2 and ax3 < 3: + B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 - 3] + for ax1, ax2, ax3 in T.grid(224, 7, 16): + if 1 <= ax2 and ax3 < 3: + B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 - 1] + for ax0, ax1, ax2, ax3 in T.grid(109, 224, 7, 16): + if ax3 < 3: + B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax0 * 2 + ax2 + 1] + for ax1, ax2, ax3 in T.grid(224, 7, 16): + if ax2 < 5 and ax3 < 3: + B[ax1 * 112 + ax2 * 16 + ax3] = A[ax3 * 50176 + ax1 * 224 + ax2 + 219] + + mod = tvm.ir.module.IRModule.from_expr(main) + with tvm.transform.PassContext( + config={ + "tir.LoopPartition": { + "partition_const_loop": True, + "unroll_loop_with_partition_hint_no_interval": True, + } + } + ): + mod = tvm.tir.transform.LowerOpaqueBlock()(mod) + mod = tvm.tir.transform.FlattenBuffer()(mod) + mod = tvm.tir.transform.LoopPartition()(mod) + mod = tvm.tir.transform.UnrollLoop()(mod) + mod = tvm.tir.transform.RemoveNoOp()(mod) + mod = tvm.tir.transform.Simplify()(mod) + assert tvm.ir.structural_equal(mod["main"], partitioned_main) + + if __name__ == "__main__": - test_basic() - test_const_loop() - test_multi_loop() - test_multi_if() - test_thread_axis() - test_vectorize() - test_condition() - test_condition_EQ() - test_thread_axis2() - test_everything_during_deduction() - test_single_likely() - test_multi_likely() - test_oneD_pool() - test_cce_loop_1() - test_cce_loop_2() - test_cce_loop_3() - test_conv_tiling() - test_double_splitting_with_indivisible_factors() - test_multilevel_splitting_with_indivisble_factors() - test_simple_rfactor() - test_explicit_partition_hint() - test_condition_mutually_exclusive() + tvm.testing.main() From 6be04d72c2a2d65b791a43a40167101ce4064ff2 Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 8 Sep 2022 10:28:17 +0530 Subject: [PATCH 124/704] =?UTF-8?q?[OpenCLML]=20CLML=20Profiling=20fixes?= =?UTF-8?q?=20corresponding=20to=20OpenCL=20Timer=20recent=20=E2=80=A6=20(?= 
=?UTF-8?q?#12711)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [OpenCLML] CLML Profiling fixes corresponding to OpenCL Timer recent changes. * [OpenCLML] Review comments. * * review comment --- src/runtime/contrib/clml/clml_runtime.cc | 161 ++++++++---------- .../contrib/test_clml/infrastructure.py | 6 +- .../python/contrib/test_clml/test_network.py | 4 +- tests/python/contrib/test_clml/test_ops.py | 2 +- 4 files changed, 80 insertions(+), 93 deletions(-) diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index 7966c0e78b2d..da41442ef91d 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -131,37 +131,14 @@ class CLMLRuntime : public JSONRuntimeBase { // Setup CLML Context cl_int result = 0; - // Initialize Context and Command Queue - result = clGetPlatformIDs(1, &platform, NULL); - ICHECK(result == CL_SUCCESS) << "clGetPlatformIDs:" << result; + workspace = cl::OpenCLWorkspace::Global(); + workspace->Init(); + tentry = workspace->GetThreadEntry(); - uint32_t num_devices = 0; - result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); - ICHECK(result == CL_SUCCESS && num_devices == 1) << "clGetDeviceIDs:" << result; - - result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); - ICHECK(device_id && result == CL_SUCCESS) << "clGetDeviceIDs:" << result; - - if (!ExtensionStringPresent(device_id)) { + if (!ExtensionStringPresent()) { LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n"; return; } - - // Reuse the OpenCl work space from TVM Device API. 
- auto func = tvm::runtime::Registry::Get("device_api.opencl"); - ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry"; - auto device_api = static_cast(((*func)()).operator void*()); - this->context = device_api->context; - bool queue_found = false; - for (size_t i = 0; i < device_api->devices.size(); ++i) { - if (device_api->devices[i] == device_id) { - this->queue = device_api->queues[i]; - this->evts = &(device_api->events[i]); - queue_found = true; - } - } - ICHECK(queue_found != false) << "Device queue not found in OpenCL Workspace"; - // Query and Get CLML Interface static const cl_uint MAX_VERSIONS = 256; cl_int majorVersions[MAX_VERSIONS]; @@ -220,8 +197,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) { cl_int result = 0; cl_event evt = NULL; - result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, data, layout, tensor->tensor, - tensor->memory, + result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(workspace->GetQueue(tentry->device), data, + layout, tensor->tensor, tensor->memory, 0, // n waitlist NULL, // waitlist &evt); // event @@ -233,8 +210,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_int result = 0; cl_event readEvent = NULL; // Read the output tensor - result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, tensor->tensor, tensor->memory, data, - layout, + result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(workspace->GetQueue(tentry->device), + tensor->tensor, tensor->memory, data, layout, 0, // n waitlist NULL, // waitlist &readEvent); // event @@ -253,6 +230,8 @@ class CLMLRuntime : public JSONRuntimeBase { */ void Run() override { cl_int result = 0; + cl_command_queue queue = workspace->GetQueue(tentry->device); + std::vector& evts = workspace->GetEventQueue(tentry->device); for (size_t i = 0; i < input_nodes_.size(); ++i) { auto nid = input_nodes_[i]; uint32_t eid = EntryID(nid, 0); @@ -286,22 +265,26 @@ class CLMLRuntime : public JSONRuntimeBase { } 
for (size_t i = 0; i < this->layer_.function.size(); ++i) { - this->evts->resize(this->evts->size() + 1); - cl_event* evt = &(this->evts->back()); - result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], - this->layer_.descriptorSet, 0, NULL, evt); + if (getenv("CLML_PROFILING")) { + evts.resize(evts.size() + 1); + cl_event* evt = &(evts.back()); + result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], + this->layer_.descriptorSet, 0, NULL, evt); + } else { + result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i], + this->layer_.descriptorSet, 0, NULL, NULL); + } ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result; } if (getenv("CLML_PROFILING")) { cl_ulong start, end; cl_ulong duration = 0; - clWaitForEvents(1, &(this->evts->back())); + clWaitForEvents(1, &(evts.back())); for (size_t i = 0; i < this->layer_.layer_names.size(); ++i) { - clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), - &start, nullptr); - clGetEventProfilingInfo((*this->evts)[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, + clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); + clGetEventProfilingInfo(evts[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr); duration += (end - start); LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] << " Duration:" << (end - start); } @@ -425,7 +408,7 @@ class CLMLRuntime : public JSONRuntimeBase { JSONGraphNode node = it->second.second; void* node_data = nullptr; - allocateTensorMemory(h_ClmlIntf, context, tensor_desc); + allocateTensorMemory(h_ClmlIntf, workspace->context, tensor_desc); if (node.GetOpType() == "const") { node_data = data_entry_[EntryID(it->first, 0)]->data; @@ -449,8 +432,9 @@ class CLMLRuntime : public JSONRuntimeBase { LOG(WARNING) << "CLML Tunning In Progress:"; for (size_t i = 0; i < this->layer_.function.size(); ++i) { LOG(WARNING) << "CLML Tunning:" << i; - result = 
h_ClmlIntf->clTuneMLOpQCOM(queue, this->layer_.function[i], - this->layer_.descriptorSet, this->tuning_cache, NULL); + result = h_ClmlIntf->clTuneMLOpQCOM(workspace->GetQueue(tentry->device), + this->layer_.function[i], this->layer_.descriptorSet, + this->tuning_cache, NULL); ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result; } @@ -499,10 +483,13 @@ class CLMLRuntime : public JSONRuntimeBase { uint32_t n, c, h, w; }; - bool ExtensionStringPresent(cl_device_id device_id) { + bool ExtensionStringPresent(void) { cl_int result = 0; - + if (workspace->platform_id == nullptr) { + return 0; + } size_t reqd_size = 0; + cl_device_id device_id = workspace->devices[workspace->GetThreadEntry()->device.device_id]; result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &reqd_size); ICHECK(reqd_size > 0u && result == CL_SUCCESS) << "clGetDeviceInfo:" << result; @@ -525,7 +512,7 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = { dtype, layout, dims.n, dims.c, dims.h, dims.w, 0, CL_TENSOR_DIMENSIONS_4D_QCOM, { 0 }}; CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast(pClmlIntf); - result = clmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &tensor); + result = clmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &tensor); ICHECK(tensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; (void)result; return tensor; @@ -538,10 +525,11 @@ class CLMLRuntime : public JSONRuntimeBase { cl_mem buffer = NULL; CLMLInterfaceV2QCOM* clmlIntf = reinterpret_cast(pClmlIntf); - result = clmlIntf->clGetMLTensorMemorySizeQCOM(context, pTensorMemDesc->tensor, &size); + result = + clmlIntf->clGetMLTensorMemorySizeQCOM(workspace->context, pTensorMemDesc->tensor, &size); ICHECK(result == CL_SUCCESS) << "clGetMLTensorMemorySizeQCOM:" << result; - buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &result); + buffer = clCreateBuffer(workspace->context, CL_MEM_READ_WRITE, size, NULL, &result); ICHECK(result == 
CL_SUCCESS) << "clCreateBuffer:" << result; pTensorMemDesc->memory = buffer; @@ -592,7 +580,8 @@ class CLMLRuntime : public JSONRuntimeBase { cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); auto tensor_dsc = std::make_shared(); - tensor_dsc->tensor = DeviceMakeCLMLTensor(h_ClmlIntf, context, dims, layout, cl_dtype); + tensor_dsc->tensor = + DeviceMakeCLMLTensor(h_ClmlIntf, workspace->context, dims, layout, cl_dtype); return tensor_dsc; } @@ -703,7 +692,8 @@ class CLMLRuntime : public JSONRuntimeBase { } else { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor); + result = + h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << "clCreateMLTensorQCOM:" << result; bias->tensor = layer_.unusedTensor; } @@ -723,13 +713,13 @@ class CLMLRuntime : public JSONRuntimeBase { if (!has_bn) { if (!has_act) { result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM( - context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor, output->tensor, - &op, NULL); + workspace->context, 0, &conv_desc, input->tensor, weight->tensor, bias->tensor, + output->tensor, &op, NULL); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { result = h_ClmlIntf->clCreateMLOpFusedConvolutionActivationForwardQCOM( - context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor, bias->tensor, NULL, - output->tensor, &op, tuning_cache); + workspace->context, 0, &conv_desc, &act_desc, input->tensor, weight->tensor, + bias->tensor, NULL, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } layer_.func_ins.push_back(input); @@ -753,13 +743,13 @@ class CLMLRuntime : public JSONRuntimeBase { CL_ARITHMETIC_MODE_FP32_QCOM}; if (!has_act) { result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM( - 
context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, bias->tensor, - output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, - tuning_cache); + workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, + bias->tensor, output->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, + bn_bias->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Convolution Error:" << result; } else { result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormActivationForwardQCOM( - context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor, + workspace->context, 0, &conv_desc, &bn_desc, &act_desc, input->tensor, weight->tensor, bias->tensor, output->tensor, NULL, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, bn_bias->tensor, &op, tuning_cache); @@ -790,12 +780,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor); + result = + h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(context, 0, &act_desc, input->tensor, - layer_.unusedTensor, output->tensor, &op, - tuning_cache); + result = h_ClmlIntf->clCreateMLOpActivationForwardQCOM(workspace->context, 0, &act_desc, + input->tensor, layer_.unusedTensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Activation Error:" << result; layer_.func_ins.push_back(input); @@ -834,8 +825,8 @@ class CLMLRuntime : public JSONRuntimeBase { CL_ARITHMETIC_MODE_FP32_QCOM}; result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM( - context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, bn_scale->tensor, - bn_bias->tensor, output->tensor, &op, tuning_cache); + workspace->context, 0, 
&bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, + bn_scale->tensor, bn_bias->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Batchnorm Error:" << result; layer->function.push_back(op); @@ -872,12 +863,13 @@ class CLMLRuntime : public JSONRuntimeBase { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; - result = h_ClmlIntf->clCreateMLTensorQCOM(context, NULL, &desc, &layer_.unusedTensor); + result = + h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &layer_.unusedTensor); ICHECK(layer_.unusedTensor && result == CL_SUCCESS) << ":" << result; - result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(context, 0, &pool_desc, input->tensor, - layer_.unusedTensor, output->tensor, &op, - tuning_cache); + result = h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, + input->tensor, layer_.unusedTensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; layer_.func_ins.push_back(input); @@ -904,8 +896,8 @@ class CLMLRuntime : public JSONRuntimeBase { CL_SOFTMAX_MODE_INSTANCE_QCOM, CL_ARITHMETIC_MODE_FP32_QCOM}; - result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(context, 0, &softmax_desc, input->tensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc, + input->tensor, output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result; layer_.func_ins.push_back(input); @@ -946,8 +938,8 @@ class CLMLRuntime : public JSONRuntimeBase { {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0}, CL_ARITHMETIC_MODE_FP32_QCOM}; - result = h_ClmlIntf->clCreateMLOpPadQCOM(context, 0, &pad_desc, input->tensor, output->tensor, - &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == 
CL_SUCCESS) << "Pad Error:" << result; layer_.func_ins.push_back(input); @@ -968,8 +960,8 @@ class CLMLRuntime : public JSONRuntimeBase { auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); auto output = MakeCLMLTensorFromJSONNode(node); - result = h_ClmlIntf->clCreateMLOpReshapeQCOM(context, 0, input->tensor, output->tensor, &op, - tuning_cache); + result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Reshape Error:" << result; layer_.func_ins.push_back(input); @@ -1004,13 +996,13 @@ class CLMLRuntime : public JSONRuntimeBase { auto output = MakeCLMLTensorFromJSONNode(node); if (has_bias) { - result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, &fc_desc, input->tensor, - weight->tensor, bias->tensor, - output->tensor, &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM( + workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor, + output->tensor, &op, tuning_cache); } else { - result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(context, 0, &fc_desc, input->tensor, - weight->tensor, NULL, output->tensor, &op, - tuning_cache); + result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(workspace->context, 0, &fc_desc, + input->tensor, weight->tensor, NULL, + output->tensor, &op, tuning_cache); } ICHECK(op && result == CL_SUCCESS) << "Fully Connected Error:" << result; @@ -1039,8 +1031,8 @@ class CLMLRuntime : public JSONRuntimeBase { {{a_min}, CL_FLOAT}, CL_ARITHMETIC_MODE_FP32_QCOM}; - result = h_ClmlIntf->clCreateMLOpClipQCOM(context, 0, &clip_desc, input->tensor, output->tensor, - &op, tuning_cache); + result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor, + output->tensor, &op, tuning_cache); ICHECK(op && result == CL_SUCCESS) << "Clip Error:" << result; layer_.func_ins.push_back(input); @@ -1056,11 +1048,8 @@ class CLMLRuntime : public JSONRuntimeBase { 
CachedLayer layer_; // CLML Context CLMLInterfaceV2QCOM* h_ClmlIntf = NULL; - cl_platform_id platform = NULL; - cl_context context = NULL; - cl_device_id device_id = NULL; - cl_command_queue queue = NULL; - std::vector* evts; + cl::OpenCLWorkspace* workspace = NULL; + cl::OpenCLThreadEntry* tentry = NULL; cl_ml_tuningcache_qcom tuning_cache = NULL; bool is_tuning_run; char* tuning_file; diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py index 19901d733e4c..0cf76079e8fb 100644 --- a/tests/python/contrib/test_clml/infrastructure.py +++ b/tests/python/contrib/test_clml/infrastructure.py @@ -73,11 +73,11 @@ class Device: connection_type = "tracker" host = "localhost" - port = 9090 + port = 9150 target = "opencl" target_host = "llvm -mtriple=aarch64-linux-gnu" - device_key = "" - cross_compile = "" + device_key = "android" + cross_compile = "aarch64-linux-android-g++" def __init__(self): """Keep remote device for lifetime of object.""" diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py index d89676f10e3a..405f5782ff2e 100644 --- a/tests/python/contrib/test_clml/test_network.py +++ b/tests/python/contrib/test_clml/test_network.py @@ -22,8 +22,7 @@ from tvm import relay import tvm -from test_clml.infrastructure import skip_runtime_test, build_and_run -from test_clml.infrastructure import Device +from test_clml.infrastructure import skip_runtime_test, build_and_run, Device def _build_and_run_network(mod, params, inputs, data, device, atol, rtol): @@ -86,7 +85,6 @@ def get_model(): mobilenet = MobileNet( include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000 ) - mobilenet.load_weights("mobilenet_1_0_224_tf.h5") inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")} data = {} diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py index 63f5bc168fd0..13f49d152714 100644 --- 
a/tests/python/contrib/test_clml/test_ops.py +++ b/tests/python/contrib/test_clml/test_ops.py @@ -212,5 +212,5 @@ def test_batchnorm(): if __name__ == "__main__": - # test_conv2d() + test_conv2d() test_batchnorm() From 62bdc91b1aee1c88dc128273abb637174d0e2071 Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Wed, 7 Sep 2022 22:16:46 -0700 Subject: [PATCH 125/704] Add Arm DSP implementation of Depthwise Conv2D (#12448) --- python/tvm/relay/op/strategy/arm_cpu.py | 22 ++ python/tvm/topi/arm_cpu/depthwise_conv2d.py | 19 ++ .../arm_cpu/mprofile/dsp/depthwise_conv2d.py | 245 ++++++++++++++++++ .../dsp/micro_kernel/quad_channel_convolve.py | 180 +++++++++++++ .../strategy/arm_cpu/test_depthwise_conv2d.py | 25 ++ 5 files changed, 491 insertions(+) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index ba28b6c7c31c..2d9ef99ba8a6 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -235,6 +235,28 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.arm_cpu", ) + + # Optimized special case depthwiseConv2D operation. Requires a 3x3 kernel, a + # NHWC layout, a HWOI kernel layout (which we rearrange), no dilation, int8 inputs, + # int32 output, the same number of input and output channels, and for that channel + # count to be divisible by 4. Additional work could remove these restrictions. 
+ + elif ( + target.features.has_dsp + and kernel.shape[0] == kernel.shape[1] == 3 + and dilation_w == dilation_h == 1 + and kernel.shape[3] == 1 # channel_multiplier == 1 + and data.dtype == "int8" + and out_type.dtype == "int32" + and data.shape[3] % 4 == 0 + and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0) + ): + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp), + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc_dsp), + name="depthwise_conv2d_nhwc_dsp.arm_cpu", + ) + else: logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") strategy.add_implementation( diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index c21480724ae4..333db3d5e014 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -28,6 +28,11 @@ from .tensor_intrin import smlal_int16_int32 from .arm_utils import is_aarch64_arm +from .mprofile.dsp.depthwise_conv2d import ( + depthwise_conv2d_nhwc_dsp_compute, + depthwise_conv2d_nhwc_dsp_schedule, +) + @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): @@ -699,3 +704,17 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, last): s[kernel_vec].parallel(co) return s + + +@autotvm.register_topi_compute("depthwise_conv2d_nhwc_dsp.arm_cpu") +def depthwise_conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nhwc with v7e-m DSP instructions.""" + return depthwise_conv2d_nhwc_dsp_compute( + cfg, data, kernel, strides, padding, dilation, out_dtype + ) + + +@autotvm.register_topi_schedule("depthwise_conv2d_nhwc_dsp.arm_cpu") +def schedule_depthwise_conv2d_nhwc_dsp(cfg, outs): + """Create schedule for conv2d_nhwc_dsp""" + return depthwise_conv2d_nhwc_dsp_schedule(cfg, outs) diff --git 
a/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py new file mode 100644 index 000000000000..162bf65a21f9 --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/depthwise_conv2d.py @@ -0,0 +1,245 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""ARM Cortex-M DSP schedule for depthwise_conv2d""" + +import random +import string + +from tvm import te +from tvm.topi.utils import traverse_inline, get_const_tuple +from tvm.topi.nn.pad import pad +from tvm import tir + +from .micro_kernel.quad_channel_convolve import ( + intrin_quad_channel_convolve, + quad_channel_convolve_impl, +) + +# For depthwise_conv2d, kernels are normally given in HWOI format, +# which when input_channels = output channels, we will call HWC. +# This is bad, as we want "related" parts of the kernel to be next +# to each other, so we can use __SMLAD later. +# +# Consider a 3x3 int8 kernel with no bias vector, with eight +# channels. Let us specify entries in the kernel as H_W_C - i.e. +# where 0_2_3 represents the rightmost position in the first row +# of channel 4/8 (4 because of zero indexing). Each [ ] represents +# a 32-bit integer. 
We currently store the kernel as: +# +# 0 ................................31 +# [ 0_0_0 || 0_0_1 || 0_0_2 || 0_0_3 ] [ 0_0_4 || 0_0_5 || 0_0_6 || 0_0_7 ] +# [ 0_1_0 || 0_1_1 || 0_1_2 || 0_1_3 ] [ 0_1_4 || 0_1_5 || 0_1_6 || 0_1_7 ] +# [ 0_2_0 || 0_2_1 || 0_2_2 || 0_2_3 ] [ 0_2_4 || 0_2_5 || 0_2_6 || 0_2_7 ] +# [ 1_0_0 || 1_0_1 || 1_0_2 || 1_0_3 ] [ 1_0_4 || 1_0_5 || 1_0_6 || 1_0_7 ] +# [ 1_1_0 || 1_1_1 || 1_1_2 || 1_1_3 ] [ 1_1_4 || 1_1_5 || 1_1_6 || 1_1_7 ] +# [ 1_2_0 || 1_2_1 || 1_2_2 || 1_2_3 ] [ 1_2_4 || 1_2_5 || 1_2_6 || 1_2_7 ] +# [ 2_0_0 || 2_0_1 || 2_0_2 || 2_0_3 ] [ 2_0_4 || 2_0_5 || 2_0_6 || 2_0_7 ] +# [ 2_1_0 || 2_1_1 || 2_1_2 || 2_1_3 ] [ 2_1_4 || 2_1_5 || 2_1_6 || 2_1_7 ] +# [ 2_2_0 || 2_2_1 || 2_2_2 || 2_2_3 ] [ 2_2_4 || 2_2_5 || 2_2_6 || 2_2_7 ] +# +# Let 0x00 be all zeros. We rearrange into: +# +# 0 ................................31 +# [ 0_0_0 || 0_0_1 || 0_1_0 || 0_1_1 ] [ 0_0_2 || 0_0_3 || 0_1_2 || 0_1_3 ] +# [ 0_2_0 || 0_2_1 || 1_0_0 || 1_0_1 ] [ 0_2_2 || 0_2_3 || 1_0_2 || 1_0_3 ] +# [ 1_1_0 || 1_1_1 || 1_2_0 || 1_2_1 ] [ 1_1_2 || 1_1_3 || 1_2_2 || 1_2_3 ] +# [ 2_0_0 || 2_0_1 || 2_1_0 || 2_1_1 ] [ 2_0_2 || 2_0_3 || 2_1_2 || 2_1_3 ] +# [ 2_2_0 || 2_2_1 || 0x000 || 0x000 ] [ 2_2_2 || 2_2_3 || 0x000 || 0x000 ] +# [ 0_0_4 || 0_0_5 || 0_1_4 || 0_1_5 ] [ 0_0_6 || 0_0_7 || 0_1_6 || 0_1_7 ] +# [ 0_2_4 || 0_2_5 || 1_0_4 || 1_0_5 ] [ 0_2_6 || 0_2_7 || 1_0_6 || 1_0_7 ] +# [ 1_1_4 || 1_1_5 || 1_2_4 || 1_2_5 ] [ 1_1_6 || 1_1_7 || 1_2_6 || 1_2_7 ] +# [ 2_0_4 || 2_0_5 || 2_1_4 || 2_1_5 ] [ 2_0_6 || 2_0_7 || 2_1_6 || 2_1_7 ] +# [ 2_2_4 || 2_2_5 || 0x000 || 0x000 ] [ 2_2_6 || 2_2_7 || 0x000 || 0x000 ] +# +# This saves us six operations comapred to the original ordering, as we +# do not need halfword packing instructions. +# +# This kernel re-arranging function will be used for 3x3 kernels (as that +# is all this DSP implementation currently supports) but would work with +# any M*N kernel such that M*N is odd. 
+ + +def _rearrange_kernel(kernel): + # Kernel must be HWC format. + kernel_h, kernel_w, channels, _ = get_const_tuple(kernel.shape) + assert channels % 4 == 0 + + # This restriction could be removed by only using tir.if_then_else to add padding + # zeros if (kernel_w * kernel_h) % 2 == 1, and filling completely otherwise. + assert (kernel_w * kernel_h) % 2 == 1 + + def fcompute(c_o, pos, c_i): + channel = (2 * (pos % 2)) + (c_i % 2) + (4 * c_o) + true_pos_index = 2 * (pos // 2) + (c_i // 2) + + return tir.if_then_else( + true_pos_index < (kernel_h * kernel_w), + kernel[true_pos_index // kernel_w, true_pos_index % kernel_w, channel, 0], + tir.const(0, "int8"), + ) + + return te.compute( + (channels // 4, kernel_h * kernel_w + 1, 4), + fcompute, + name="packed_kernel", + ) + + +def depthwise_conv2d_nhwc_dsp_compute(_cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for v7e-m DSP instructions of DepthwiseConv2D. Has a lot of requirements + for use - if not all apply, the fallback implementation will be used instead.""" + assert isinstance(strides, int) or len(strides) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + + if isinstance(strides, int): + stride_h = stride_w = strides + else: + stride_h, stride_w = strides + + # We do not support dilation currently. It would be possible, but it would require + # modifying the way the kernel is packed. Gnarly. + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + assert dilation_h == dilation_w == 1 + + batch_size, height, width, channels = data.shape + kernel_h, kernel_w, _, _ = kernel.shape + + # We require that the number of channels be divisible by 4. This restriction could + # be removed with strip mining if people cared. + assert channels % 4 == 0 + + # We don't support different numbers of input and output channels. 
+ assert channels == kernel.shape[2] + assert kernel.shape[3] == 1 + + # We take in int8 as our dtype, but we spit out int32. This is because we cannot + # round until we compute activations. + assert out_dtype == "int32" + + # This can pretty easily be generalized in the future. Likely worth doing, and this + # function was written to make doing so easy. Should only require adding more calls + # to QUAD_CHANNEL_REARRANGE_SUM. + assert kernel_w == kernel_h == 3 + + # Padding the data requires COPYING THE ENTIRE INPUT TENSOR, which + # is slow and bad. We should really implement a strip mining + # routine to avoid this, but TVM has terrible support for that. + + if padding == "SAME": + # This assumption makes the logic easier. Could be removed with work. + assert height % stride_h == width % stride_w == 0 + + output_h = height // stride_h + output_w = width // stride_w + + # This padding behavior is consistent with other TVM depthwise_conv2d schedules. However it + # differs from the TensorFlow, which only pads the bottom right if stride > 1. This probably + # brings down accuracy slightly for models imported from TFLite. 
+ pad_down = 1 if stride_h == 1 else 0 + pad_right = 1 if stride_w == 1 else 0 + + padded_data = pad( + data, + [0, kernel_h // 2, kernel_w // 2, 0], + [0, pad_down, pad_right, 0], + name="padded_data", + ) + + elif padding == "VALID": + assert height > kernel_h and width > kernel_w + output_h = (height - kernel_h) // stride_h + 1 + output_w = (width - kernel_w) // stride_w + 1 + padded_data = data + + elif isinstance(padding, tuple): + if len(padding) == 2: + pad_up, pad_down = padding[0] + pad_left, pad_right = padding[1] + else: + pad_up, pad_left, pad_down, pad_right = padding + + output_h = (height - kernel_h + pad_up + pad_down) // stride_h + 1 + output_w = (width - kernel_w + pad_left + pad_right) // stride_w + 1 + padded_data = pad( + data, + [0, pad_up, pad_left, 0], + [0, pad_down, pad_right, 0], + name="padded_data", + ) + + else: + raise RuntimeError() + _, padded_h, padded_w, _ = padded_data.shape + + packed_kernel = _rearrange_kernel(kernel) + kh_i = te.reduce_axis((0, kernel_h), name="kh_i") + kw_i = te.reduce_axis((0, kernel_w), name="kw_i") + return te.compute( + (batch_size, output_h, output_w, channels), + lambda h, i, j, k: te.sum( + padded_data[h, (i * stride_h) + kh_i, (j * stride_w) + kw_i, k].astype("int32") + * packed_kernel[ + k // 4, + (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2), + (2 * ((kh_i + kw_i) % 2)) + (k % 2), + ].astype("int32"), + axis=(kh_i, kw_i), + ), + name="depthwise_conv2d", + tag=f"depthwise_conv2d_nhwc_{padded_h}_{padded_w}_dsp", + ) + + +def depthwise_conv2d_nhwc_dsp_schedule(_cfg, outs): + + """Schedule function for v7e-m DSP instructions of conv2d.""" + schedule = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "depthwise_conv2d_nhwc" not in op.tag: + return + + # extract tensors + output = op.output(0) + padded_data = output.op.input_tensors[0] + packed_kernel = output.op.input_tensors[1] + kernel = packed_kernel.op.input_tensors[0] + + _, _, padded_w, channels = padded_data.shape + kernel_h, 
kernel_w, _, _ = kernel.shape + suffix = "".join(random.choices(string.ascii_uppercase, k=8)) + + b_ax, y_ax, x_ax, c_ax = schedule[output].op.axis + ky_ax, kx_ax = schedule[output].op.reduce_axis + c_ax_o, c_ax_i = schedule[output].split(c_ax, factor=4) + schedule[output].reorder(b_ax, c_ax_o, y_ax, x_ax, ky_ax, kx_ax, c_ax_i) + + quad_channel_convolve = intrin_quad_channel_convolve( + padded_w, channels, kernel_h, kernel_w, suffix + ) + schedule[output].tensorize(ky_ax, quad_channel_convolve) + schedule[output].pragma( + b_ax, + "import_c", + quad_channel_convolve_impl(padded_w, channels, kernel_h, kernel_w, suffix), + ) + + traverse_inline(schedule, outs[-1].op, _callback) + return schedule diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py new file mode 100644 index 000000000000..960ef8fadc0e --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/quad_channel_convolve.py @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""This is a special intrinsic used for depthwise convolution using Cortex-M DSP instructions +(v7e-m). 
It takes as inputs an int8 HWC data tensor and an int8 CHWc kernel. This intrinsic "lays" +the kernel on top of the data tensors starting from a given pointer, performs signed sixteen-bit +multiplies on each pair of values, and sums all the products in an int32 accumlator. This process is +repeated four times giving four int32 outputs - one per channel.""" + +import textwrap + +from tvm import te, tir + + +def intrin_quad_channel_convolve(tensor_w, channels, kernel_h, kernel_w, suffix): + """Defines a v7e-m DSP-accelerated four-channel convolution.""" + data_slice = te.placeholder((kernel_h, kernel_w, 4), name="a", dtype="int8") + + if kernel_h * kernel_w % 2 == 1: + kernel_length = kernel_h * kernel_w + 1 + else: + kernel_length = kernel_h * kernel_w + kernel_slice = te.placeholder((kernel_length, 4), name="b", dtype="int8") + + kh_i = te.reduce_axis((0, kernel_h), name="kh_i") + kw_i = te.reduce_axis((0, kernel_w), name="kw_i") + + output_slice = te.compute( + (4,), + lambda k: te.sum( + data_slice[kh_i, kw_i, k].astype("int32") + * kernel_slice[ + (2 * ((3 * kh_i + kw_i) // 2)) + ((k % 4) // 2), + (2 * ((kh_i + kw_i) % 2)) + (k % 2), + ].astype("int32"), + axis=(kh_i, kw_i), + ), + name="c", + ) + + data_buf = tir.decl_buffer( + data_slice.shape, + data_slice.dtype, + name="data", + offset_factor=1, + strides=[tensor_w * channels, channels, 1], + ) + kernel_buf = tir.decl_buffer( + kernel_slice.shape, kernel_slice.dtype, name="kernel", offset_factor=1, strides=[4, 1] + ) + output_buf = tir.decl_buffer( + output_slice.shape, output_slice.dtype, name="output", offset_factor=1, strides=[1] + ) + + def intrin_func(ins, outs): + builder = tir.ir_builder.create() + builder.emit( + tir.call_extern( + "int32", + f"kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}", + outs[0].access_ptr("w"), + ins[0].access_ptr("r"), + ins[1].access_ptr("r"), + ) + ) + return builder.get() + + return te.decl_tensor_intrin( + output_slice.op, + intrin_func, + 
binds={data_slice: data_buf, kernel_slice: kernel_buf, output_slice: output_buf}, + ) + + +def quad_channel_convolve_impl(tensor_w, channels, kernel_h, kernel_w, suffix): + """Emits C code for quad_channel_convolve. Note that while intrin_quad_channel_convolve supports + any kernel size, this function only supports 3x3 kernels (this could be fixed with work).""" + assert kernel_h == kernel_w == 3 + + return textwrap.dedent( + ( + f""" + #include + #include + + // __SXTB16(_ROR(X, Y)) is combined into one assembly instruction + + #define TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( \ + arranged_kernel, \ + tensor_v0_c3210, tensor_v1_c3210, \ + sum0, sum1, sum2, sum3) {{ \ + \ + uint32_t tensor_v0_c20 = __SXTB16(tensor_v0_c3210); \ + uint32_t tensor_v0_c31 = __SXTB16(__ROR(tensor_v0_c3210, 8)); \ + uint32_t tensor_v1_c20 = __SXTB16(tensor_v1_c3210); \ + uint32_t tensor_v1_c31 = __SXTB16(__ROR(tensor_v1_c3210, 8)); \ + \ + uint32_t kernel_v1c1_v1c0_v0c1_v0c0 = *arranged_kernel++; \ + uint32_t kernel_v1c3_v1c2_v0c3_v0c2 = *arranged_kernel++; \ + \ + uint32_t kernel_v10_c0 = __SXTB16(kernel_v1c1_v1c0_v0c1_v0c0); \ + uint32_t kernel_v10_c1 = __SXTB16(__ROR(kernel_v1c1_v1c0_v0c1_v0c0, 8)); \ + uint32_t kernel_v10_c2 = __SXTB16(kernel_v1c3_v1c2_v0c3_v0c2); \ + uint32_t kernel_v10_c3 = __SXTB16(__ROR(kernel_v1c3_v1c2_v0c3_v0c2, 8)); \ + \ + uint32_t tensor_v10_c0 = __PKHBT(tensor_v0_c20, tensor_v1_c20, 16); \ + uint32_t tensor_v10_c1 = __PKHBT(tensor_v0_c31, tensor_v1_c31, 16); \ + uint32_t tensor_v10_c2 = __PKHTB(tensor_v1_c20, tensor_v0_c20, 16); \ + uint32_t tensor_v10_c3 = __PKHTB(tensor_v1_c31, tensor_v0_c31, 16); \ + \ + sum_c0 = __SMLAD(tensor_v10_c0, kernel_v10_c0, sum_c0); \ + sum_c1 = __SMLAD(tensor_v10_c1, kernel_v10_c1, sum_c1); \ + sum_c2 = __SMLAD(tensor_v10_c2, kernel_v10_c2, sum_c2); \ + sum_c3 = __SMLAD(tensor_v10_c3, kernel_v10_c3, sum_c3); \ + }} + + /* We do four channels at once to get this speed boost. 
*/ + #ifdef __cplusplus + extern "C" + #endif + int32_t kernel_convolve_w{tensor_w}_c{channels}_kh{kernel_h}_kw{kernel_w}_{suffix}( + uint32_t *out, + uint32_t *tensor, + uint32_t *packed_kernel) {{ + + uint32_t sum_c0 = 0; + uint32_t sum_c1 = 0; + uint32_t sum_c2 = 0; + uint32_t sum_c3 = 0; + + TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( + packed_kernel, + *tensor, + *(tensor + {channels // 4}), + sum_c0, sum_c1, sum_c2, sum_c3) + TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( + packed_kernel, + *(tensor + {(2) * channels // 4}), + *(tensor + {tensor_w * (channels // 4)}), + sum_c0, sum_c1, sum_c2, sum_c3) + TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( + packed_kernel, + *(tensor + {(tensor_w + 1) * (channels // 4)}), + *(tensor + {(tensor_w + 2) * (channels // 4)}), + sum_c0, sum_c1, sum_c2, sum_c3) + TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( + packed_kernel, + *(tensor + {(2 * tensor_w) * (channels // 4)}), + *(tensor + {(2 * tensor_w + 1) * (channels // 4)}), + sum_c0, sum_c1, sum_c2, sum_c3) + TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP( + packed_kernel, + *(tensor + {(2 * tensor_w + 2) * (channels // 4)}), + 0, + sum_c0, sum_c1, sum_c2, sum_c3) + + out[0] = sum_c0; + out[1] = sum_c1; + out[2] = sum_c2; + out[3] = sum_c3; + return 0; + }} + + #undef TVMGEN_QUAD_CHANNEL_REARRANGE_SUM_DSP + """ + ) + ) diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py index ee0d51c321f7..18c5082f2a0c 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -147,5 +147,30 @@ class TestDepthwiseConv2d_NHWC_HWOI(BasicDepthwiseConv2dTests): schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc.generic") +class TestDepthwiseConv2d_NHWC_HWOI_DSP(BasicDepthwiseConv2dTests): + """This test is for depthwise_conv2d_nhwc_dsp.arm_cpu schedule.""" + + data_shape, kernel_size, num_filter, strides, padding, dilation = tvm.testing.parameters( + 
# The LLVM implementation doesn't support "SAME" and "VALID" padding, + # so padding must be explicitly specified. + # Depthwise_conv2d parameters from MobileNetV1 0.25x + ((1, 48, 48, 8), (3, 3), 8, (1, 1), 1, 1), + ((1, 48, 48, 16), (3, 3), 16, (2, 2), (1, 1, 0, 0), 1), + ((1, 24, 24, 32), (3, 3), 32, (1, 1), 1, 1), + ((1, 24, 24, 32), (3, 3), 32, (2, 2), (1, 1, 0, 0), 1), + ((1, 12, 12, 64), (3, 3), 64, (1, 1), 1, 1), + ((1, 12, 12, 64), (3, 3), 64, (2, 2), (1, 1, 0, 0), 1), + ((1, 6, 6, 128), (3, 3), 128, (1, 1), 1, 1), + ((1, 6, 6, 128), (3, 3), 128, (2, 2), (1, 1, 0, 0), 1), + ((1, 3, 3, 256), (3, 3), 256, (1, 1), 1, 1), + # Asymmetric height and width + ((1, 25, 5, 64), (3, 3), 64, (1, 1), 1, 1), + ) + data_layout = tvm.testing.parameter("NHWC") + dtype = tvm.testing.parameter("int8") + kernel_layout = tvm.testing.parameter("HWOI") + schedule_name = tvm.testing.parameter("depthwise_conv2d_nhwc_dsp.arm_cpu") + + if __name__ == "__main__": tvm.testing.main() From cd99ca64cb2416219215745c1d478b86776378ed Mon Sep 17 00:00:00 2001 From: Gavin Uberti Date: Thu, 8 Sep 2022 01:54:10 -0700 Subject: [PATCH 126/704] [Relay] Change when int8 operations are converted to int16 on Arm (#12671) Currently, Relay QNN uses its `helper_no_fast_int8_hw_legalization` to convert most `int8` convolution and dense operations into `int16` ones on Arm. This currently occurs on ARM chips except for `v8.2a` chips with `dotprod` support. However, this behavior means that `int8` operations are replaced with `int16` ones on Cortex-M chips. On these chips `int16` is substantially slower, as while it saves a few sign extension operations, it doubles the amount of memory loads we need to perform. This PR changes when `helper_no_fast_int8_hw_legalization` is used on Arm, and instead makes **not** doing this replacement the standard. We will only do this replacement if we are on a chip with ASIMD support but without `v8.2a` and `dotprod`. 
This ensures that Cortex-M microcontrollers do not have `int8` operations turned into `int16` ones. I have also verified that this does, in fact, improve performance for some common models. For example, MobileNet_v1_0.25 on the Cortex-M4 saw a 10% performance improvement, compared to before this change. Accuracy does not seem to be affected. --- python/tvm/relay/qnn/op/legalizations.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 2fcdaf362a22..9bc6efdad00f 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -424,7 +424,8 @@ def is_aarch64_arm(): @qnn_conv2d_legalize.register("arm_cpu") def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): - # ARM prefers the dtypes to be same. + target = tvm.target.Target.current(allow_none=False) + has_asimd = is_aarch64_arm() or "+neon" in target.mattr is_depthwise = relay.op.strategy.is_depthwise_conv2d( types[0].shape, attrs["data_layout"], @@ -432,18 +433,23 @@ def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types): attrs["kernel_layout"], attrs["groups"], ) - use_int8_on_arm = (not is_depthwise) and is_aarch64_arm() and attrs["data_layout"] == "NHWC" - if use_int8_on_arm or is_fast_int8_on_arm(): - return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) - return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) + use_int8_on_arm = (not is_depthwise) and attrs["data_layout"] == "NHWC" + has_dotprod = is_fast_int8_on_arm() + other_options = use_int8_on_arm or has_dotprod + if has_asimd and not other_options: + return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d) + # ARM prefers the dtypes to be same. 
+ return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d) @qnn_dense_legalize.register("arm_cpu") def _qnn_dense_legalize_arm_cpu(attrs, inputs, types): + target = tvm.target.Target.current(allow_none=False) + has_asimd = is_aarch64_arm() or "+neon" in target.mattr + if has_asimd and not is_fast_int8_on_arm(): + return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense) # ARM prefers the dtypes to be same. - if is_fast_int8_on_arm(): - return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense) - return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.dense) + return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.dense) ########################## From 2d36e460079f6920ab97a6b2de31fe678895ce62 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 8 Sep 2022 14:49:55 +0100 Subject: [PATCH 127/704] [CI][AArch64] Mark tests to be skipped due to torch crash (#12730) Some integration tests are not being run on CI due to the configuration of the machine with onnx and torch not calling the integration tests script. 
This patch skips two more tests failing with the error message below: ``` "OSError: /.../torch/lib/libgomp-d22c30c5.so.1: cannot allocate memory in static TLS block" ``` --- tests/python/driver/tvmc/test_frontends.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index 1ccac7696fcc..c1a3be67c208 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -237,6 +237,10 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant): tvmc.load(tflite_mobilenet_v1_1_quant, model_format="onnx") +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_load_model__pth(pytorch_resnet18): # some CI environments wont offer torch, so skip in case it is not present pytest.importorskip("torch") @@ -432,6 +436,10 @@ def test_import_tensorflow_friendly_message(pb_mobilenet_v1_1_quant, monkeypatch _ = tvmc.frontends.load_model(pb_mobilenet_v1_1_quant, model_format="pb") +@pytest.mark.skipif( + platform.machine() == "aarch64", + reason="Currently failing on AArch64 - see https://github.com/apache/tvm/issues/10673", +) def test_import_torch_friendly_message(pytorch_resnet18, monkeypatch): monkeypatch.setattr("importlib.import_module", mock_error_on_name("torch")) From 4f4bc26607712adfed539e21916cddc3dc2dd601 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 8 Sep 2022 14:50:14 +0100 Subject: [PATCH 128/704] [MetaSchedule] Mark two tests as xfail (#12733) This patch marks two tests as xfail for further investigation: * test_meta_schedule_integration_extract_from_resnet_with_filter_func * test_meta_schedule_integration_extract_from_resnet --- tests/python/unittest/test_meta_schedule_integration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/unittest/test_meta_schedule_integration.py 
b/tests/python/unittest/test_meta_schedule_integration.py index 69522831ee55..366a2e4887ed 100644 --- a/tests/python/unittest/test_meta_schedule_integration.py +++ b/tests/python/unittest/test_meta_schedule_integration.py @@ -64,6 +64,7 @@ def test_meta_schedule_dynamic_loop_extent(): assert not extracted_tasks +@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732") @requires_torch def test_meta_schedule_integration_extract_from_resnet(): mod, params, _ = get_network(name="resnet_18", input_shape=[1, 3, 224, 224]) @@ -198,6 +199,7 @@ def test_meta_schedule_integration_extract_from_bert_base(): assert expected_shape == shape, t.task_name +@pytest.mark.xfail(strict=True, reason="See https://github.com/apache/tvm/issues/12732") @requires_torch def test_meta_schedule_integration_extract_from_resnet_with_filter_func(): @register_func("relay.backend.tir_converter.remove_purely_spatial", override=True) From ed630122c281f47493e2941a7dc471e201904587 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 8 Sep 2022 14:50:36 +0100 Subject: [PATCH 129/704] [Test] Add tvm.testing.requires_libtorch (#12737) Create a specific test dependency to map to USE_LIBTORCH, which is disabled by default, and is independent from torch being installed on the underlying machine, so it causes problems in machines that have torch installed but TVM is built with USE_LIBTORCH OFF. Mark tests.python.contrib.test_libtorch_ops.test_backend with this new decorator.
--- python/tvm/testing/utils.py | 3 +++ tests/python/contrib/test_libtorch_ops.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 5b70eb06911b..37a27a4213e9 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -945,6 +945,9 @@ def _any_gpu_exists(): # Mark a test as requiring Arm(R) Ethos(TM)-N to run requires_ethosn = Feature("ethosn", "Arm(R) Ethos(TM)-N", cmake_flag="USE_ETHOSN") +# Mark a test as requiring libtorch to run +requires_libtorch = Feature("libtorch", "LibTorch", cmake_flag="USE_LIBTORCH") + # Mark a test as requiring Hexagon to run requires_hexagon = Feature( "hexagon", diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py index 28ae39c329f5..2bfb78b407aa 100644 --- a/tests/python/contrib/test_libtorch_ops.py +++ b/tests/python/contrib/test_libtorch_ops.py @@ -19,6 +19,7 @@ import tvm.relay from tvm.relay.op.contrib import torchop +from tvm.testing import requires_libtorch import_torch_error = None @@ -30,6 +31,7 @@ @pytest.mark.skipif(torch is None, reason=f"PyTorch is not available: {import_torch_error}") +@requires_libtorch def test_backend(): @torch.jit.script def script_fn(x, y): From b2bd434ef944315a6f241803ac03c59c9aaa9847 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 8 Sep 2022 08:02:42 -0700 Subject: [PATCH 130/704] [TIR] Handle axis_separators during FlattenBuffer (#12652) * [TIR] Moved tir.FlattenBuffer to occur before tir.LowerOpaqueBlock For buffers with more than one physical axis, the `axis_separators` are required in order to know which groups of logical axes to fuse into each physical axis. The implementation in `tir.FlattenBuffer` assumed that all buffers were being flattened to a single physical axis. 
Because `tir.LowerOpaqueBlock` replaces the `BlockNode::alloc_buffers` with `Allocate` nodes, `tir.FlattenBuffer` no longer has access to the axis separators and performs inconsistent flattening for `Allocate` as opposed to `BufferLoad`/`BufferStore`. This was introduced in https://github.com/apache/tvm/pull/12172, which decoupled the lowering/flattening steps. The commit reorders the `tir.FlattenBuffer` to occur before `tir.LowerOpaqueBlock`, to make use of the axis separators. Any `Allocate` nodes that exist at that point (e.g. from hand-written schedules) are still flattened to 1-d physical buffers, but the `BlockNode::alloc_buffers` are flattened according to the axis separators. * Add unit test to validate non-flat memory after tvm.lower * Explicitly write T.reads for test on BufferRegion updates * Update incorrect docstring for test * Use DeclBuffer information in FlattenBuffer The DeclBuffer node can be inserted during LowerOpaqueBlock, then provide the missing Buffer information required to flatten the allocation. * Use T.allocate in unit tests With the insertion of `DeclBuffer` nodes, `LowerOpaqueBlock` no longer needs to be before `FlattenBuffer`, and has been moved back to its original position. Reverting the tests to use `T.allocate` instead of `T.alloc_buffer` more closely represents the functions as they are being lowered. * Fix usage of T.decl_buffer in updated tests * Update LowerOpaqueBlock to expect the DeclBuffer nodes * Strip DeclBuffer annotation in FlattenBuffer The DeclBuffer annotations aren't yet supported in all passes. This restricts them to being introduced in LowerOpaqueBlock, then immediately removed in FlattenBuffer.
* Strip out all DeclBuffer nodes in FlattenBuffer * Update unit tests to remove expectation of DeclBuffer nodes --- src/tir/transforms/flatten_buffer.cc | 123 ++++- src/tir/transforms/lower_opaque_block.cc | 1 + .../test_tir_transform_flatten_buffer.py | 502 ++++++++++-------- .../test_tir_transform_lower_opaque_block.py | 22 +- 4 files changed, 417 insertions(+), 231 deletions(-) diff --git a/src/tir/transforms/flatten_buffer.cc b/src/tir/transforms/flatten_buffer.cc index 22aef136bcff..5441120491c6 100644 --- a/src/tir/transforms/flatten_buffer.cc +++ b/src/tir/transforms/flatten_buffer.cc @@ -21,6 +21,7 @@ * \file flatten_buffer.cc */ +#include #include #include @@ -53,6 +54,34 @@ class BufferFlattener : public StmtExprMutator { } } + Stmt VisitStmt_(const BlockNode* op) final { + ICHECK_EQ(op->match_buffers.size(), 0) + << "Unexpected MatchBufferRegion found during tir.transform.FlattenBuffer. " + << "All MatchBufferRegion should be removed in tir.transform.LowerMatchBuffer."; + + Block block = GetRef(op); + + Array alloc_buffers = op->alloc_buffers; + alloc_buffers.MutateByApply([this](Buffer buf) { return GetFlattenedBuffer(buf); }); + if (!alloc_buffers.same_as(op->alloc_buffers)) { + block.CopyOnWrite()->alloc_buffers = alloc_buffers; + } + + Array reads = op->reads; + reads.MutateByApply([this](BufferRegion region) { return MutateBufferRegion(region); }); + if (!reads.same_as(op->reads)) { + block.CopyOnWrite()->reads = reads; + } + + Array writes = op->writes; + writes.MutateByApply([this](BufferRegion region) { return MutateBufferRegion(region); }); + if (!writes.same_as(op->writes)) { + block.CopyOnWrite()->writes = writes; + } + + return StmtExprMutator::VisitStmt_(block.get()); + } + Stmt VisitStmt_(const AllocateNode* op) final { Allocate alloc = Downcast(StmtExprMutator::VisitStmt_(op)); // TODO(Lunderberg): Move the handling of boolean into a @@ -61,18 +90,70 @@ class BufferFlattener : public StmtExprMutator { auto writer = alloc.CopyOnWrite(); 
writer->dtype = DataType::Int(8); } - // Handle multi-dimension allocations + if (alloc->extents.size() == 1) { - return std::move(alloc); - } else { - Array flat_extent(static_cast(1), 1); - for (size_t i = 0; i < alloc->extents.size(); i++) { - flat_extent.Set(0, flat_extent[0] * alloc->extents[i]); + // No flattening required for buffers that are already flat + + // TODO(rfc-70): Keep the DeclBuffer node as-is. Stripping it + // out in the current implementation as not all lowering passes + // support DeclBuffer. + if (auto* decl_buffer = alloc->body.as()) { + alloc.CopyOnWrite()->body = std::move(decl_buffer->body); } - auto n = alloc.CopyOnWrite(); - n->extents = flat_extent; + return std::move(alloc); } + + if (auto* decl_buffer = alloc->body.as(); + decl_buffer && decl_buffer->buffer->data.same_as(alloc->buffer_var)) { + // N-d buffer, use the DeclBuffer inside to determine how it + // should be flattened. + auto& buffer = decl_buffer->buffer; + bool matching_buffer = [&]() { + if (alloc->dtype != buffer->dtype) { + return false; + } + if (alloc->extents.size() != buffer->shape.size()) { + return false; + } + ExprDeepEqual expr_equal; + for (size_t i = 0; i < alloc->extents.size(); i++) { + if (!expr_equal(alloc->extents[i], buffer->shape[i])) { + return false; + } + } + return true; + }(); + + if (matching_buffer) { + Buffer flattened = GetFlattenedBuffer(buffer); + + auto n = alloc.CopyOnWrite(); + // TODO(rfc-70): Update the DeclBuffer node instead of + // stripping it out. Stripping it out in the current + // implementation as not all lowering passes support + // DeclBuffer. 
+ // + // n->body = DeclBuffer(flattened, std::move(decl_buffer->body)); + n->body = std::move(decl_buffer->body); + n->extents = flattened->shape; + return std::move(alloc); + } else { + ICHECK(decl_buffer->buffer->axis_separators.empty()) + << "DeclBuffer node doesn't match Allocate extents, but also shouldn't be " + "flattened to 1-d physical memory"; + } + } + + // Fallback, this is an allocation without a matching DeclBuffer + PrimExpr flat_extent = 1; + for (const auto& dim : alloc->extents) { + flat_extent *= dim; + } + + auto n = alloc.CopyOnWrite(); + n->extents = {flat_extent}; + return std::move(alloc); } Buffer GetFlattenedBuffer(Buffer buf) { @@ -141,6 +222,32 @@ class BufferFlattener : public StmtExprMutator { return node; } + BufferRegion MutateBufferRegion(BufferRegion region) { + Buffer orig_buf = region->buffer; + Buffer flattened_buf = GetFlattenedBuffer(orig_buf); + if (flattened_buf.same_as(orig_buf)) { + return region; + } + + Array min_values; + Array max_values; + for (const auto& range : region->region) { + min_values.push_back(range->min); + max_values.push_back(range->min + range->extent - 1); + } + + Array flattened_min = orig_buf->ElemOffset(min_values); + Array flattened_max = orig_buf->ElemOffset(max_values); + + Array flattened_ranges; + ICHECK_EQ(flattened_min.size(), flattened_max.size()); + for (size_t i = 0; i < flattened_min.size(); i++) { + flattened_ranges.push_back(Range(flattened_min[i], flattened_max[i] + 1)); + } + + return BufferRegion(flattened_buf, flattened_ranges); + } + /*! \brief Map of buffers being remapped. 
*/ std::unordered_map buffer_remap_; diff --git a/src/tir/transforms/lower_opaque_block.cc b/src/tir/transforms/lower_opaque_block.cc index a4655ebbaed5..ce74fdc4c17b 100644 --- a/src/tir/transforms/lower_opaque_block.cc +++ b/src/tir/transforms/lower_opaque_block.cc @@ -57,6 +57,7 @@ class OpaqueBlockLower : public StmtExprMutator { new_shape.Set(i, buffer->strides[i - 1] / buffer->strides[i]); } } + body = DeclBuffer(buffer, std::move(body)); body = Allocate(buffer->data, buffer->dtype, new_shape, const_true(), std::move(body)); } // Step 4. Handle annotations, block annotations are not preserved by default. diff --git a/tests/python/unittest/test_tir_transform_flatten_buffer.py b/tests/python/unittest/test_tir_transform_flatten_buffer.py index 4cdf71889eee..870208499e7a 100644 --- a/tests/python/unittest/test_tir_transform_flatten_buffer.py +++ b/tests/python/unittest/test_tir_transform_flatten_buffer.py @@ -20,223 +20,307 @@ from tvm.script import tir as T -def _check(original, transformed): - func = original - mod = tvm.IRModule.from_expr(func) - mod = tvm.tir.transform.FlattenBuffer()(mod) - mod = tvm.tir.transform.Simplify()(mod) - tvm.ir.assert_structural_equal(mod["main"], transformed, True) - - -@T.prim_func -def elementwise_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), "float32") - C = T.match_buffer(c, (16, 16), "float32") - for i in T.serial(0, 16): - B_new_data = T.allocate([1, 16], "float32", "global") - B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data) - for j in T.serial(0, 16): - B_new[0, j] = A[i, j] + 1.0 - for j in T.serial(0, 16): - C[i, j] = B_new[0, j] * 2.0 - - -@T.prim_func -def flattened_elementwise_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, 256, "float32") - C = T.match_buffer(c, 256, "float32") - T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) - T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) - for i in T.serial(0, 16): - B_new_data = 
T.allocate([16], "float32", "global") - B_new = T.buffer_decl(shape=[16], dtype="float32", data=B_new_data) - for j in T.serial(0, 16): - B_new[j] = A[((i * 16) + j)] + 1.0 - for j in T.serial(0, 16): - C[((i * 16) + j)] = B_new[j] * 2.0 - - -@T.prim_func -def gpu_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), "float32") - C = T.match_buffer(c, (16, 16), "float32") - - i0 = T.env_thread("blockIdx.x") - i1 = T.env_thread("threadIdx.x") - i2 = T.env_thread("vthread") - - T.launch_thread(i0, 4) - T.launch_thread(i1, 2) - T.launch_thread(i2, 2) - B_data = T.allocate([1, 16], "float32", "local") - B = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_data, scope="local") - for j in range(0, 16): - B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0 - for j in range(0, 16): - C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0 - - -@T.prim_func -def flattened_gpu_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, 256, "float32") - C = T.match_buffer(c, 256, "float32") - T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) - T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) - - i0 = T.env_thread("blockIdx.x") - i1 = T.env_thread("threadIdx.x") - i2 = T.env_thread("vthread") - - T.launch_thread(i0, 4) - T.launch_thread(i1, 2) - T.launch_thread(i2, 2) - B_data = T.allocate([16], "float32", "local") - B = T.buffer_decl(shape=[16], dtype="float32", data=B_data, scope="local") - for j in range(0, 16): - B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0 - for j in range(0, 16): - C[i0 * 64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0 - - -@T.prim_func -def symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: - A = T.match_buffer(a, (n, m), "float32") - C = T.match_buffer(c, (n, m), "float32") - - for i in range(0, n): - B_data = T.allocate([m], "float32", "global") - B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) - for j in range(0, m): - B[j] = A[i, j] + 1.0 - for j in range(0, m): - C[i, j] = B[j] * 
2.0 - - -@T.prim_func -def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: - A = T.match_buffer(a, n * m, "float32") - C = T.match_buffer(c, n * m, "float32") - T.preflattened_buffer(A, (n, m), "float32", data=A.data) - T.preflattened_buffer(C, (n, m), "float32", data=C.data) - - for i in range(0, n): - B_data = T.allocate([m], "float32", "global") - B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) - for j in range(0, m): - B[j] = A[i * m + j] + 1.0 - for j in range(0, m): - C[i * m + j] = B[j] * 2.0 - - -@T.prim_func -def multi_alloc_func(a: T.handle, d: T.handle) -> None: - A = T.match_buffer(a, (4, 32), "float32") - D = T.match_buffer(d, (4, 32), "float32") - - for i, j in T.grid(4, 32): - B_data = T.allocate((4, 32), "float32", scope="global") - B = T.buffer_decl(shape=(4, 32), dtype="float32", data=B_data) - C_data = T.allocate((4, 32), "float32", scope="global") - C = T.buffer_decl(shape=(4, 32), dtype="float32", data=C_data) - B[i, j] = A[i, j] + 1.0 - C[i, j] = A[i, j] + B[i, j] - D[i, j] = C[i, j] * 2.0 - - -@T.prim_func -def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None: - A = T.match_buffer(a, 128, "float32") - D = T.match_buffer(d, 128, "float32") - T.preflattened_buffer(A, (4, 32), "float32", data=A.data) - T.preflattened_buffer(D, (4, 32), "float32", data=D.data) - - for i, j in T.grid(4, 32): - B_data = T.allocate([128], "float32", "global") - B = T.buffer_decl(shape=[128], dtype="float32", data=B_data) - C_data = T.allocate([128], "float32", "global") - C = T.buffer_decl(shape=[128], dtype="float32", data=C_data) - B[i * 32 + j] = A[i * 32 + j] + 1.0 - C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j] - D[i * 32 + j] = C[i * 32 + j] * 2.0 - - -@T.prim_func -def strided_buffer_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (16, 16), "float32") - C = T.match_buffer(c, (16, 16), "float32") - for i0 in T.serial(4): - B_data = T.allocate([4, 17], "float32", "global") - B = 
T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data) - B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) - for i1, j in T.grid(4, 16): - B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0 - for i1, j in T.grid(4, 16): - C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0 - - -@T.prim_func -def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, (256,), "float32") - C = T.match_buffer(c, (256,), "float32") - T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data) - T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data) - for i0 in T.serial(0, 4): - B_new_data = T.allocate([68], "float32", "global") - B_new = T.buffer_decl(shape=[68], dtype="float32", data=B_new_data) - for i1 in T.serial(0, 4): - for j in T.serial(0, 16): - B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0 - for i1 in T.serial(0, 4): - for j in T.serial(0, 16): - C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0 - - -@T.prim_func -def boolean_handling_before(a: T.Buffer[10, "bool"], b: T.Buffer[10, "bool"]) -> None: - for i0 in T.serial(10): - b[i0] = a[i0] - - -@T.prim_func -def boolean_handling_after(a: T.Buffer[10, "int8"], b: T.Buffer[10, "int8"]) -> None: - T.preflattened_buffer(a, [10], dtype="bool", data=a.data) - T.preflattened_buffer(b, [10], dtype="bool", data=b.data) - # body - for i0 in T.serial(10): - b[i0] = T.cast(T.cast(a[i0], "bool"), "int8") - - -def test_elementwise(): - _check(elementwise_func, flattened_elementwise_func) - - -def test_gpu_workload(): - _check(gpu_func, flattened_gpu_func) +class BaseCompare(tvm.testing.CompareBeforeAfter): + transform = tvm.transform.Sequential( + [ + tvm.tir.transform.FlattenBuffer(), + tvm.tir.transform.Simplify(), + ] + ) -def test_symbolic_shape(): - _check(symbolic_func, flattened_symbolic_func) - - -def test_multi_alloc(): - _check(multi_alloc_func, flattened_multi_alloc_func) +class TestElementwise(BaseCompare): + """2-d buffers are flattened to 1-d""" + def before(A: 
T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): + for i in T.serial(0, 16): + B_new = T.decl_buffer([1, 16], "float32") + for j in T.serial(0, 16): + B_new[0, j] = A[i, j] + 1.0 + for j in T.serial(0, 16): + C[i, j] = B_new[0, j] * 2.0 + + def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]): + T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) + T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) + for i in T.serial(0, 16): + B_new_data = T.allocate([16], "float32", scope="global") + B_new = T.buffer_decl([16], "float32", scope="global", data=B_new_data) + for j in T.serial(0, 16): + B_new[j] = A[((i * 16) + j)] + 1.0 + for j in T.serial(0, 16): + C[((i * 16) + j)] = B_new[j] * 2.0 -def test_strided_buffer(): - _check(strided_buffer_func, flattened_strided_buffer_func) +class TestElementwiseWithoutDeclBuffer(BaseCompare): + """2-d buffers are flattened to 1-d -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.FlattenBuffer()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # FlattenBuffer should do nothing on TE + Like TestElementwise, but the TIR doesn't have the DeclBuffer + node. The T.buffer_decl declaration applies only during the + parsing the TVMScript, and doesn't occur in the TIR itself. In + this case, the allocation should be assumed to be targeting flat + memory, and should be flattened to a 1-d allocation. 
+ """ + def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): + for i in T.serial(0, 16): + B_new_data = T.allocate([1, 16], "float32", "global") + B_new = T.buffer_decl([1, 16], "float32", data=B_new_data) + for j in T.serial(0, 16): + B_new[0, j] = A[i, j] + 1.0 + for j in T.serial(0, 16): + C[i, j] = B_new[0, j] * 2.0 + + def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]): + T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) + T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) + for i in T.serial(0, 16): + B_new_data = T.allocate([16], "float32", "global") + B_new = T.buffer_decl(16, "float32", data=B_new_data) + for j in T.serial(0, 16): + B_new[j] = A[((i * 16) + j)] + 1.0 + for j in T.serial(0, 16): + C[((i * 16) + j)] = B_new[j] * 2.0 + + +class TestGPU(BaseCompare): + """Buffer flattening may have indices based on GPU thread vars""" + + def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): + i0 = T.env_thread("blockIdx.x") + i1 = T.env_thread("threadIdx.x") + i2 = T.env_thread("vthread") + + T.launch_thread(i0, 4) + T.launch_thread(i1, 2) + T.launch_thread(i2, 2) + B = T.decl_buffer([1, 16], "float32", scope="local") + for j in range(0, 16): + B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0 + for j in range(0, 16): + C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0 + + def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]): + T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data) + T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data) + + i0 = T.env_thread("blockIdx.x") + i1 = T.env_thread("threadIdx.x") + i2 = T.env_thread("vthread") + + T.launch_thread(i0, 4) + T.launch_thread(i1, 2) + T.launch_thread(i2, 2) + B_data = T.allocate([16], "float32", scope="local") + B = T.buffer_decl([16], "float32", scope="local", data=B_data) + for j in range(0, 16): + B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0 + for j in range(0, 16): + C[i0 * 
64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0 + + +class TestSymbolic(BaseCompare): + """Dynamically-sized arrrays are flattened""" + + def before(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: + A = T.match_buffer(a, (n, m), "float32") + C = T.match_buffer(c, (n, m), "float32") + + for i in range(0, n): + B = T.decl_buffer([m], "float32") + for j in range(0, m): + B[j] = A[i, j] + 1.0 + for j in range(0, m): + C[i, j] = B[j] * 2.0 + + def expected(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None: + A = T.match_buffer(a, n * m, "float32") + C = T.match_buffer(c, n * m, "float32") + T.preflattened_buffer(A, (n, m), "float32", data=A.data) + T.preflattened_buffer(C, (n, m), "float32", data=C.data) + + for i in range(0, n): + B_data = T.allocate([m], "float32", scope="global") + B = T.buffer_decl([m], "float32", scope="global", data=B_data) + for j in range(0, m): + B[j] = A[i * m + j] + 1.0 + for j in range(0, m): + C[i * m + j] = B[j] * 2.0 + + +class TestMultiAlloc(BaseCompare): + """If multiple allocations occur, all are flattened.""" + + def before(A: T.Buffer[(4, 32), "float32"], D: T.Buffer[(4, 32), "float32"]): + for i, j in T.grid(4, 32): + B = T.decl_buffer((4, 32), "float32", scope="global") + C = T.decl_buffer((4, 32), "float32", scope="global") + B[i, j] = A[i, j] + 1.0 + C[i, j] = A[i, j] + B[i, j] + D[i, j] = C[i, j] * 2.0 + + def expected(A: T.Buffer[128, "float32"], D: T.Buffer[128, "float32"]): + T.preflattened_buffer(A, (4, 32), "float32", data=A.data) + T.preflattened_buffer(D, (4, 32), "float32", data=D.data) + + for i, j in T.grid(4, 32): + B_data = T.allocate([128], "float32", scope="global") + B = T.buffer_decl([128], "float32", scope="global", data=B_data) + C_data = T.allocate([128], "float32", scope="global") + C = T.buffer_decl([128], "float32", scope="global", data=C_data) + B[i * 32 + j] = A[i * 32 + j] + 1.0 + C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j] + D[i * 32 + j] = C[i * 32 + j] * 2.0 + + +class 
TestStrided(BaseCompare): + """Indices for flattened buffers use the specified striding.""" + + def before(A: T.Buffer[(16, 16), "float32"], C: T.Buffer[(16, 16), "float32"]): + for i0 in T.serial(4): + B = T.decl_buffer([4, 17], "float32") + B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) + for i1, j in T.grid(4, 16): + B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0 + for i1, j in T.grid(4, 16): + C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0 + + def expected(A: T.Buffer[256, "float32"], C: T.Buffer[256, "float32"]): + T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data) + T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data) + for i0 in T.serial(0, 4): + B_new_data = T.allocate([68], "float32", scope="global") + B_new = T.buffer_decl([68], "float32", scope="global", data=B_new_data) + for i1 in T.serial(0, 4): + for j in T.serial(0, 16): + B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0 + for i1 in T.serial(0, 4): + for j in T.serial(0, 16): + C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0 + + +class TestBoolean(BaseCompare): + """Boolean buffers should be replaced by a backing int8 array""" + + def before(A: T.Buffer[10, "bool"], B: T.Buffer[10, "bool"]) -> None: + for i0 in T.serial(10): + B[i0] = A[i0] + + def expected(A: T.Buffer[10, "int8"], B: T.Buffer[10, "int8"]) -> None: + T.preflattened_buffer(A, [10], dtype="bool", data=A.data) + T.preflattened_buffer(B, [10], dtype="bool", data=B.data) + # body + for i0 in T.serial(10): + B[i0] = T.cast(T.cast(A[i0], "bool"), "int8") + + +class TestLowerTE(BaseCompare): + """FlattenBuffer should do nothing on TE-based functions""" + + def before(self): + x = te.placeholder((1,)) + y = te.compute((1,), lambda i: x[i] + 2) + s = te.create_schedule(y.op) + mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) + return mod["main"] + + expected = before + + +class TestFlattenInsideBlock(BaseCompare): + """Flattening access inside a block flattens the accessed region.""" + + 
def before(): + A = T.alloc_buffer([32, 32]) + for i, j in T.grid(32, 32): + with T.block("block"): + T.reads(A[i, j]) + T.evaluate(A[i, j]) + + def expected(): + A = T.alloc_buffer([1024]) + for i, j in T.grid(32, 32): + with T.block("block"): + T.reads(A[i * 32 + j]) + T.evaluate(A[i * 32 + j]) + + +class TestNoChangeTo2DPhysicalBuffer(BaseCompare): + """Flattening preserves axis separators.""" + + def before(): + A = T.alloc_buffer([32, 32], axis_separators=[1]) + for i, j in T.grid(32, 32): + T.evaluate(A[i, j]) + + expected = before + + +class TestFlattenAllocBufferWithAxisSeparators(BaseCompare): + """Flattening preserves axis separators""" + + def before(): + A = T.alloc_buffer([2, 3, 5, 7, 11, 13], axis_separators=[3]) + for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13): + T.evaluate(A[i0, i1, i2, i3, i4, i5]) + + def expected(): + A = T.alloc_buffer([30, 1001], axis_separators=[1]) + for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13): + T.evaluate(A[i0 * 15 + i1 * 5 + i2, i3 * 143 + i4 * 13 + i5]) + + +class TestFlattenDeclBufferWithAxisSeparators(BaseCompare): + """Flattening preserves axis separators + + Like TestFlattenAllocBufferWithAxisSeparators, but the allocations + is done using Allocate/DeclBuffer, rather than through + BlockNode::alloc_buffers. + """ + + def before(): + A = T.decl_buffer([2, 3, 5, 7, 11, 13], axis_separators=[3]) + for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13): + T.evaluate(A[i0, i1, i2, i3, i4, i5]) + + def expected(): + A_data = T.allocate([30, 1001], dtype="float32", scope="global") + A = T.buffer_decl( + [30, 1001], dtype="float32", scope="global", axis_separators=[1], data=A_data + ) + for i0, i1, i2, i3, i4, i5 in T.grid(2, 3, 5, 7, 11, 13): + T.evaluate(A[i0 * 15 + i1 * 5 + i2, i3 * 143 + i4 * 13 + i5]) + + +def test_lower_2d_physical_memory(): + """Axis separators should preserve 2-d buffers through lowering. 
-def test_boolean_handling(): - _check(boolean_handling_before, boolean_handling_after) + A catch-all test to ensure that defining axis_separators is + sufficient to maintain non-flat buffer descriptions through all + lowering steps. + """ + + # This test doesn't use CompareBeforeAfter, because the after step + # is not currently expressible in TVMScript. This test can be + # re-written after https://github.com/apache/tvm/pull/12412. + + @T.prim_func + def func(): + buf = T.alloc_buffer( + [1, 1], + dtype="int32", + scope="global", + axis_separators=[1], + ) + buf[0, 0] = 0 + + lowered = tvm.lower(func)["main"] + assert isinstance(lowered.body, tvm.tir.Allocate) + assert list(lowered.body.extents) == [1, 1], ( + "Non-flat buffer allocations, " + "marked by axis_separators, " + "flattened to flat memory allocation." + ) if __name__ == "__main__": diff --git a/tests/python/unittest/test_tir_transform_lower_opaque_block.py b/tests/python/unittest/test_tir_transform_lower_opaque_block.py index f8f3e3a5aced..824cef174055 100644 --- a/tests/python/unittest/test_tir_transform_lower_opaque_block.py +++ b/tests/python/unittest/test_tir_transform_lower_opaque_block.py @@ -54,8 +54,7 @@ def transformed_elementwise_func(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") for i in T.serial(0, 16): - B_new_data = T.allocate([1, 16], "float32", "global") - B_new = T.buffer_decl(shape=[1, 16], dtype="float32", data=B_new_data) + B_new = T.decl_buffer(shape=[1, 16], dtype="float32") for j in T.serial(0, 16): B_new[0, j] = A[i, j] + 1.0 for j in T.serial(0, 16): @@ -97,8 +96,7 @@ def transformed_gpu_func(a: T.handle, c: T.handle) -> None: T.launch_thread(i0, 4) T.launch_thread(i1, 2) T.launch_thread(i2, 2) - B_data = T.allocate([1, 16], "float32", "local") - B = T.buffer_decl(shape=[1, 16], dtype="float32", scope="local", data=B_data) + B = T.decl_buffer(shape=[1, 16], dtype="float32", scope="local") for j in 
range(0, 16): B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0 for j in range(0, 16): @@ -133,8 +131,7 @@ def transformed_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) C = T.match_buffer(c, (n, m), "float32") for i in range(0, n): - B_data = T.allocate([m], "float32", "global") - B = T.buffer_decl(shape=[m], dtype="float32", data=B_data) + B = T.decl_buffer(shape=[m], dtype="float32") for j in range(0, m): B[j] = A[i, j] + 1.0 for j in range(0, m): @@ -207,10 +204,8 @@ def transformed_multi_alloc_func(a: T.handle, d: T.handle) -> None: D = T.match_buffer(d, (32), "float32") for i in range(0, 32): - B_data = T.allocate((32,), "float32", "global") - B = T.buffer_decl(shape=(32,), dtype="float32", data=B_data) - C_data = T.allocate((32,), "float32", "global") - C = T.buffer_decl(shape=(32,), dtype="float32", data=C_data) + B = T.decl_buffer(shape=(32,), dtype="float32") + C = T.decl_buffer(shape=(32,), dtype="float32") B[i] = A[i] + 1.0 C[i] = A[i] + B[i] D[i] = C[i] * 2.0 @@ -246,12 +241,11 @@ def transformed_strided_buffer_func( # body for i0 in T.serial(4): B_data = T.allocate([4, 17], "float32", "global") - B = T.buffer_decl(shape=[4, 17], dtype="float32", data=B_data) - B_1 = T.buffer_decl([4, 16], dtype="float32", data=B.data, strides=[17, 1]) + B = T.decl_buffer(shape=[4, 16], dtype="float32", strides=[17, 1], data=B_data) for i1, j in T.grid(4, 16): - B_1[i1, j] = A[i0 * 4 + i1, j] + T.float32(1) + B[i1, j] = A[i0 * 4 + i1, j] + T.float32(1) for i1, j in T.grid(4, 16): - C[i0 * 4 + i1, j] = B_1[i1, j] * T.float32(2) + C[i0 * 4 + i1, j] = B[i1, j] * T.float32(2) @T.prim_func From 299ca267e7641b5fa6e78dd131d0574e310f9a13 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 8 Sep 2022 09:35:58 -0700 Subject: [PATCH 131/704] [TIR] Update region min/extent in ReplaceBufferMutator (#12725) Prior to this commit, `ReplaceBufferMutator` only checks `BufferRegionNode::buffer` to determine if a `BufferRegion` needs to be replaced, and doesn't check the 
`BufferRegionNode::region`. As a result, updating `T.reads(A[B[i]])` would fail to replace `B`. This commit checks `BufferRegionNode::region` for buffer usage to resolve this issue. --- src/tir/schedule/transform.cc | 27 ++++++++++++++++--- .../test_tir_schedule_set_axis_separator.py | 24 +++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index 1ebaf202d487..c11fa656d6da 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -138,9 +138,30 @@ Stmt ReplaceBufferMutator::VisitStmt_(const BlockNode* block) { return this->VisitMatchBufferRegion(match_buffer); }; auto f_mutate_read_write_region = [this](const BufferRegion& buffer_region) { - auto it = buffer_var_map_.find(buffer_region->buffer->data.get()); - return it == buffer_var_map_.end() ? buffer_region - : BufferRegion(it->second, buffer_region->region); + auto region = MutateArray(buffer_region->region, [this](const Range& range) { + PrimExpr min = VisitExpr(range->min); + PrimExpr extent = VisitExpr(range->extent); + if (min.same_as(range->min) && extent.same_as(range->extent)) { + return range; + } else { + return Range::FromMinExtent(min, extent); + } + }); + + Buffer buf = [&]() { + auto it = buffer_var_map_.find(buffer_region->buffer->data.get()); + if (it == buffer_var_map_.end()) { + return buffer_region->buffer; + } else { + return it->second; + } + }(); + + if (buf.same_as(buffer_region->buffer) && region.same_as(buffer_region->region)) { + return buffer_region; + } else { + return BufferRegion(buf, region); + } }; auto f_mutate_alloc_buffers = [this](const Buffer& buffer) { auto it = buffer_var_map_.find(buffer->data.get()); diff --git a/tests/python/unittest/test_tir_schedule_set_axis_separator.py b/tests/python/unittest/test_tir_schedule_set_axis_separator.py index 9502da182926..b432fbb61066 100644 --- a/tests/python/unittest/test_tir_schedule_set_axis_separator.py +++ 
b/tests/python/unittest/test_tir_schedule_set_axis_separator.py @@ -154,6 +154,30 @@ def test_set_axis_separator_subregion(use_sugared_transform): tvm.ir.assert_structural_equal(element_wise_subregion_match_set_axis_separator, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) +class TestIndexedLookup(tvm.testing.CompareBeforeAfter): + def transform(self): + def func(mod): + sch = tir.Schedule(mod) + sch.set_axis_separator('block', 'B', [1]) + return sch.mod + return func + + @T.prim_func + def before(): + A = T.alloc_buffer([4,4], dtype="int32") + B = T.alloc_buffer([1,1], dtype="int32") + for j in T.serial(4): + with T.block('block'): + A[B[0,0],j] = 0 + + @T.prim_func + def expected(): + A = T.alloc_buffer([4,4], dtype="int32") + B = T.alloc_buffer([1,1], dtype="int32", axis_separators=[1]) + for j in T.serial(4): + with T.block('block'): + A[B[0,0],j] = 0 + if __name__ == "__main__": tvm.testing.main() From 64031d56d634a535c8e3832d9231855b688f0648 Mon Sep 17 00:00:00 2001 From: Robert Kimball Date: Thu, 8 Sep 2022 15:30:38 -0700 Subject: [PATCH 132/704] Move static array initialization into a function go avoid link errors (#12678) * Move static array initialization into a function go avoid link errors * Fix line length --- include/tvm/runtime/container/map.h | 63 +++++++++++++++-------------- src/runtime/container.cc | 4 -- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/include/tvm/runtime/container/map.h b/include/tvm/runtime/container/map.h index 4c76a3b0ad4f..53c37cc20e6b 100644 --- a/include/tvm/runtime/container/map.h +++ b/include/tvm/runtime/container/map.h @@ -1038,10 +1038,10 @@ class DenseMapNode : public MapNode { new (&Data()) KVType(std::move(v)); } /*! \brief If the entry has next entry on the linked list */ - bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; } + bool HasNext() const { return NextProbeLocation(Meta() & 0b01111111) != 0; } /*! 
\brief Move the entry to the next entry on the linked list */ bool MoveToNext(const DenseMapNode* self, uint8_t meta) { - uint64_t offset = kNextProbeLocation[meta & 0b01111111]; + uint64_t offset = NextProbeLocation(meta & 0b01111111); if (offset == 0) { index = 0; block = nullptr; @@ -1066,7 +1066,7 @@ class DenseMapNode : public MapNode { /*! \brief Get the next empty jump */ bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const { for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) { - ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self); + ListNode candidate((index + NextProbeLocation(idx)) & (self->slots_), self); if (candidate.IsEmpty()) { *jump = idx; *result = candidate; @@ -1086,33 +1086,36 @@ class DenseMapNode : public MapNode { uint32_t fib_shift_; /*! \brief array of data blocks */ Block* data_; - /* clang-format off */ - /*! \brief Candidates of probing distance */ - TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - // Quadratic probing with triangle numbers. 
See also: - // 1) https://en.wikipedia.org/wiki/Quadratic_probing - // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ - // 3) https://github.com/skarupke/flat_hash_map - 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, - 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, - 351, 378, 406, 435, 465, 496, 528, 561, 595, 630, - 666, 703, 741, 780, 820, 861, 903, 946, 990, 1035, - 1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540, - 1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145, - 2211, 2278, 2346, 2415, 2485, 2556, 2628, - // larger triangle numbers - 8515, 19110, 42778, 96141, 216153, - 486591, 1092981, 2458653, 5532801, 12442566, - 27993903, 62983476, 141717030, 318844378, 717352503, - 1614057336, 3631522476, 8170957530, 18384510628, 41364789378, - 93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695, - 5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000, - 309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701, - 17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626, - 1029107982097042876, 2315492959180353330, 5209859154120846435, - }; - /* clang-format on */ + static uint64_t NextProbeLocation(size_t index) { + /* clang-format off */ + /*! \brief Candidates of probing distance */ + static const uint64_t kNextProbeLocation[kNumJumpDists] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + // Quadratic probing with triangle numbers. 
See also: + // 1) https://en.wikipedia.org/wiki/Quadratic_probing + // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ + // 3) https://github.com/skarupke/flat_hash_map + 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, + 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, + 351, 378, 406, 435, 465, 496, 528, 561, 595, 630, + 666, 703, 741, 780, 820, 861, 903, 946, 990, 1035, + 1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540, + 1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145, + 2211, 2278, 2346, 2415, 2485, 2556, 2628, + // larger triangle numbers + 8515, 19110, 42778, 96141, 216153, + 486591, 1092981, 2458653, 5532801, 12442566, + 27993903, 62983476, 141717030, 318844378, 717352503, + 1614057336, 3631522476, 8170957530, 18384510628, 41364789378, + 93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695, + 5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000, + 309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701, + 17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, + 457381325854679626, 1029107982097042876, 2315492959180353330, 5209859154120846435, + }; + /* clang-format on */ + return kNextProbeLocation[index]; + } friend class MapNode; }; diff --git a/src/runtime/container.cc b/src/runtime/container.cc index 159404be5351..adcaecbc64cf 100644 --- a/src/runtime/container.cc +++ b/src/runtime/container.cc @@ -180,10 +180,6 @@ TVM_REGISTER_GLOBAL("runtime.MapItems").set_body([](TVMArgs args, TVMRetValue* r *ret = std::move(rkvs); }); -#if (USE_FALLBACK_STL_MAP == 0) -TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[]; -#endif - // Closure TVM_REGISTER_OBJECT_TYPE(ClosureObj); From 89ce171b8697d223032f53b5e14c459332316da8 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 8 Sep 2022 18:51:32 -0700 Subject: [PATCH 133/704] [TIR, Schedule] Check consumer in-bound and covered in reverse_compute_inline (#12717) * [TIR, 
Schedule] Generate consumer-in-bound predicate after reverse_compute_inline * Check consumer block iters are covered * fix lint --- src/tir/schedule/primitive/compute_inline.cc | 131 ++++++++++++++++-- .../test_tir_schedule_compute_inline.py | 61 ++++++++ 2 files changed, 178 insertions(+), 14 deletions(-) diff --git a/src/tir/schedule/primitive/compute_inline.cc b/src/tir/schedule/primitive/compute_inline.cc index bfda66036fe3..2ea641a2cbd4 100644 --- a/src/tir/schedule/primitive/compute_inline.cc +++ b/src/tir/schedule/primitive/compute_inline.cc @@ -30,7 +30,8 @@ static const char kErrBodyReverseInline[] = R"(The body of the inlined block sho `B[...] = g(i, j, k, A[f(i, j, k, ...)] ...)`, where A is the only buffer the block consumes, whose indices are distinct atomic variables, and there should be no variables other than the index variables), and f is a bijective affine -mapping)"; +mapping and there should not be predicates in the inlined block. The iter domains of the inlined +block should be covered by the producer block.)"; class HasInitBlock : public ScheduleError { public: @@ -161,16 +162,25 @@ class NonSingleProducerError : public ScheduleError { IRModule mod_; Block block_; - static void Check(const ScheduleState& self, const StmtSRef& consumer_block_sref, - const StmtSRef& scope_root_sref) { + /*! + * \brief Check if the block has a single producer. 
+ * \param self The schedule state + * \param block_sref The sref of the block to be checked + * \param scope_root_sref The sref of the scope root + * \return The sref of the producer block if the block has a single producer + * \throw ScheduleError if the block does not have a single producer + */ + static StmtSRef Check(const ScheduleState& self, const StmtSRef& consumer_block_sref, + const StmtSRef& scope_root_sref) { BlockScope scope = self->GetBlockScope(scope_root_sref); Array producers = scope->GetDepsByDst(consumer_block_sref); + StmtSRef producer_block_sref{nullptr}; if (producers.size() == 1 && producers[0]->kind == DepKind::kRAW) { - const StmtSRef& producer_block_sref = producers[0]->src; + producer_block_sref = producers[0]->src; if (IsCompleteBlock(self, producer_block_sref, scope_root_sref)) { Array consumers = scope->GetDepsBySrc(producer_block_sref); if (consumers.size() == 1) { - return; + return producer_block_sref; } } } @@ -521,11 +531,28 @@ class ReverseComputeInliner : public BaseInliner { }; public: - explicit ReverseComputeInliner(const Buffer& inlined_buffer, const Block& consumer_block, + explicit ReverseComputeInliner(const Buffer& inlined_buffer, const BlockNode* producer_block, + const BlockRealize& consumer_block_realize, const StmtSRef& scope_root_sref) - : BaseInliner(inlined_buffer, consumer_block, scope_root_sref) {} + : BaseInliner(inlined_buffer, consumer_block_realize->block, scope_root_sref), + producer_block_(producer_block), + consumer_block_(consumer_block_realize->block.get()) { + // Initialize the predicates to ensure consumer block iters are in-bound + consumer_iter_in_bound_ = Bool(true); + for (const IterVar& iter : consumer_block_realize->block->iter_vars) { + consumer_iter_in_bound_ = + consumer_iter_in_bound_ && + (iter->var >= iter->dom->min && iter->var < iter->dom->min + iter->dom->extent); + } + } - bool BodyPatternAllowInline(const Block& consumer_block) { + bool BodyPatternAllowInline(const BlockRealize& 
consumer_block_realize) { + const Block& consumer_block = consumer_block_realize->block; + + if (!is_one(consumer_block_realize->predicate)) { + // Failure: Predicate is the consumer block is not supported + return false; + } if (inlined_store_ == nullptr) { // Failure: block body is not BufferStore return false; @@ -557,13 +584,25 @@ class ReverseComputeInliner : public BaseInliner { /*input_iters=*/consumer_iter_doms, /*predicate=*/true, /*check_level=*/arith::IterMapLevel::Bijective, - /*analyzer=*/&analyzer, + /*analyzer=*/&analyzer_, /*simplify_trivial_iterators=*/false); buffer_load_iter_map_ = res->indices; if (buffer_load_iter_map_.empty()) { // Failure: indices of BufferLoad are not bijective affine return false; } + + const BufferStoreNode* producer_store = producer_block_->body.as(); + if (producer_store == nullptr) { + // Failure: producer block body is not BufferStore + return false; + } + CreateInverseMapping(producer_store->indices); + if (!CheckConsumerCovered()) { + // Failure: consumer block iter domains are not covered by the producer block + return false; + } + return true; } @@ -571,6 +610,34 @@ class ReverseComputeInliner : public BaseInliner { using BaseInliner::VisitExpr_; using BaseInliner::VisitStmt_; + /*! 
\brief Generate the predicate after inlining based on the consumer predicate */ + PrimExpr BuildInlinedConsumerPredicate(const BlockRealizeNode* producer_block_realize) { + // Bind the producer block iter domains for simplification + Map subst_map; + for (int i = 0, n = producer_block_realize->iter_values.size(); i < n; ++i) { + const IterVar& iter = producer_block_realize->block->iter_vars[i]; + analyzer_.Bind(iter->var, Range::FromMinExtent(iter->dom->min, iter->dom->extent)); + subst_map.Set(iter->var, producer_block_realize->iter_values[i]); + } + // Substitute the consumer block iters with the corresponding iters in the producer blocks + PrimExpr predicate = Substituter(this)(consumer_iter_in_bound_); + // Simplify the predicate using the producer block iter domains + predicate = analyzer_.Simplify(predicate); + // Substitute the producer block iters with the its bindings since the predicate in BlockRealize + // should not contain the block iters + predicate = Substitute(predicate, subst_map); + return predicate; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + BlockRealize new_block_realize = Downcast(StmtMutator::VisitStmt_(op)); + if (op->block.get() == producer_block_) { + new_block_realize.CopyOnWrite()->predicate = + BuildInlinedConsumerPredicate(new_block_realize.get()); + } + return std::move(new_block_realize); + } + Stmt VisitStmt_(const BufferStoreNode* _store) final { BufferStore store = Downcast(StmtExprMutator::VisitStmt_(_store)); if (!store->buffer.same_as(inlined_buffer_)) { @@ -579,6 +646,32 @@ class ReverseComputeInliner : public BaseInliner { return ReplaceInlinedBuffer(std::move(store)); } + /*! 
+ * \brief Check the consumer block iter domains are covered by the producer block iter domains + * \return Whether the consumer block iter domains are covered + */ + bool CheckConsumerCovered() { + Map producer_iter_doms; + for (const IterVar& iter_var : producer_block_->iter_vars) { + producer_iter_doms.Set(iter_var, arith::IntSet::FromRange(iter_var->dom)); + } + // For each block iter in the consumer block, find the corresponding expression in the producer + for (const IterVar& iter : consumer_block_->iter_vars) { + if (auto it = idx_sub_.find(iter->var.get()); it != idx_sub_.end()) { + const PrimExpr& producer_iter = it->second; + arith::IntSet producer_iter_range = arith::EvalSet(producer_iter, producer_iter_doms); + if (analyzer_.CanProve(producer_iter_range.min() > iter->dom->min) || + analyzer_.CanProve(producer_iter_range.max() < + iter->dom->min + iter->dom->extent - 1)) { + return false; + } + } else { + return false; + } + } + return true; + } + /*! * \brief Apply the inverse of `buffer_load_iter_map_` to producer indices. Update `idx_sub_` with * the result. It will be later used to transform the BufferStore indices of the producer. @@ -592,7 +685,6 @@ class ReverseComputeInliner : public BaseInliner { } Stmt ReplaceInlinedBuffer(BufferStore producer) { - CreateInverseMapping(producer->indices); producer_rhs_ = producer->value; return Substituter(this)(GetRef(inlined_store_)); } @@ -647,8 +739,16 @@ class ReverseComputeInliner : public BaseInliner { Array buffer_load_indices_; /*! \brief The IterMap representing the indices of the consumer's BufferLoad */ Array buffer_load_iter_map_{nullptr}; + /*! \brief The producer block */ + const BlockNode* producer_block_{nullptr}; + /* \brief The consumer block */ + const BlockNode* consumer_block_{nullptr}; + /*! \brief The predicate to ensure the consumer block iters are in-bound. It will be inserted + * as the predicate of the producer block after inlining. 
+ */ + PrimExpr consumer_iter_in_bound_{nullptr}; /*! \brief The arithmetic analyzer */ - arith::Analyzer analyzer; + arith::Analyzer analyzer_; }; void ComputeInlineImpl(ScheduleState self, const StmtSRef& producer_block_sref, @@ -700,6 +800,7 @@ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block bool check_only = false) { const BlockNode* _consumer_block = TVM_SREF_TO_BLOCK(consumer_block_sref); Block consumer_block = GetRef(_consumer_block); + BlockRealize consumer_block_realize = GetBlockRealize(self, consumer_block_sref); HasInitBlock::Check(self->mod, consumer_block); // Step 1. Get the scope block StmtSRef scope_root_sref = GetScopeRoot(self, consumer_block_sref, // @@ -709,10 +810,12 @@ void ReverseComputeInlineImpl(ScheduleState self, const StmtSRef& consumer_block // Step 2. Check completeness CheckCompleteBlock(self, consumer_block_sref, scope_root_sref); // Step 3. Check if the consumer has a single complete producer - NonSingleProducerError::Check(self, consumer_block_sref, scope_root_sref); + StmtSRef producer_block_sref = + NonSingleProducerError::Check(self, consumer_block_sref, scope_root_sref); // Step 4. Analyze the block body - ReverseComputeInliner inliner(inlined_buffer, consumer_block, scope_root_sref); - if (!inliner.BodyPatternAllowInline(consumer_block)) { + ReverseComputeInliner inliner(inlined_buffer, producer_block_sref->StmtAs(), + consumer_block_realize, scope_root_sref); + if (!inliner.BodyPatternAllowInline(consumer_block_realize)) { throw BodyAnalysisError(true, self->mod, consumer_block); } // Step 5. 
Create a plan that removes the leaf block to be inlined diff --git a/tests/python/unittest/test_tir_schedule_compute_inline.py b/tests/python/unittest/test_tir_schedule_compute_inline.py index ec19402969e3..20eafabc7a22 100644 --- a/tests/python/unittest/test_tir_schedule_compute_inline.py +++ b/tests/python/unittest/test_tir_schedule_compute_inline.py @@ -585,6 +585,47 @@ def exp_exp_opaque_access_with_tvm_access_ptr_inlined( ) +@T.prim_func +def elementwise_overcomputed_producer( + A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(127, 127), "float32"] +) -> None: + B = T.alloc_buffer((128, 128)) + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(127, 127): + with T.block("C"): + cvi, cvj = T.axis.remap("SS", [i, j]) + C[cvi, cvj] = B[cvi, cvj] + 1.0 + + +@T.prim_func +def elementwise_overcomputed_producer_reverse_inlined( + A: T.Buffer[(128, 128), "float32"], C: T.Buffer[(127, 127), "float32"] +) -> None: + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.where(i < 127 and j < 127) + C[vi, vj] = A[vi, vj] * 2.0 + 1.0 + + +@T.prim_func +def elementwise_producer_not_cover_consumer( + A: T.Buffer[(128, 128), "float32"], D: T.Buffer[(256, 128), "float32"] +) -> None: + B = T.alloc_buffer((128, 128)) + for i, j in T.grid(128, 128): + with T.block("B"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = A[vi, vj] * 2.0 + for i, j in T.grid(256, 128): + with T.block("C"): + vi, vj = T.axis.remap("SS", [i, j]) + D[vi, vj] = T.if_then_else(vi >= 128, B[vi - 128, vj], T.float32(0), dtype="float32") + + # pylint: enable=no-member,invalid-name,unused-variable use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) @@ -822,5 +863,25 @@ def test_compute_inline_opaque_access_with_tvm_access_ptr(use_block_name): ) +def test_reverse_compute_inline_overcomputed_producer(use_block_name): + """Test reverse compute 
inline overcomputed producer""" + sch = tir.Schedule(elementwise_overcomputed_producer, debug_mask="all") + compute = "C" if use_block_name else sch.get_block("C") + sch.reverse_compute_inline(compute) + tvm.ir.assert_structural_equal( + elementwise_overcomputed_producer_reverse_inlined, sch.mod["main"] + ) + + +def test_reverse_compute_inline_error_producer_not_cover_consumer(use_block_name): + """Test reverse compute inline failure when the inlined block iter domains are not covered by + its producer + """ + sch = tir.Schedule(elementwise_producer_not_cover_consumer, debug_mask="all") + compute = "C" if use_block_name else sch.get_block("C") + with pytest.raises(tvm.tir.ScheduleError): + sch.reverse_compute_inline(compute) + + if __name__ == "__main__": tvm.testing.main() From 1c5ffc67ad2497a2d34509e0599b3a787fcd464d Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 9 Sep 2022 00:07:09 -0700 Subject: [PATCH 134/704] [ci][docker] Use CMake 3.20.0 for cortexm (#12744) The Zephyr project builds require 3.20.0 to work correctly Co-authored-by: driazati --- docker/Dockerfile.ci_cortexm | 2 +- docker/install/ubuntu_install_cmake_source.sh | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index fb3c10d393f0..d646704bb0a8 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -33,7 +33,7 @@ COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh RUN bash /install/ubuntu1804_install_python.sh COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh -RUN bash /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0 COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh RUN bash /install/ubuntu1804_install_python_venv.sh diff --git a/docker/install/ubuntu_install_cmake_source.sh 
b/docker/install/ubuntu_install_cmake_source.sh index 030cb4ea0406..702130f07964 100755 --- a/docker/install/ubuntu_install_cmake_source.sh +++ b/docker/install/ubuntu_install_cmake_source.sh @@ -20,13 +20,19 @@ set -e set -u set -o pipefail -v=3.18 -version=3.18.4 +if [ -z ${1+x} ]; then + version=3.18.4 +else + version=$1 +fi + +v=$(echo $version | sed 's/\(.*\)\..*/\1/g') +echo "Installing cmake $version ($v)" wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz tar xvf cmake-${version}.tar.gz cd cmake-${version} ./bootstrap -make -j$(nproc) +make -j"$(nproc)" make install cd .. rm -rf cmake-${version} cmake-${version}.tar.gz From cb08a1251f247ee79d3ede2b0e843cc11c4925d0 Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Fri, 9 Sep 2022 01:29:57 -0700 Subject: [PATCH 135/704] [TF] Add DenseBincount support (#12728) --- python/tvm/relay/frontend/tensorflow_ops.py | 55 +++++++++++++++++++ .../frontend/tensorflow/test_forward.py | 41 ++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index c94a4ef2e6aa..4598f4f09a05 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -2868,6 +2868,60 @@ def _impl(inputs, attr, params, mod): return _impl +def _dense_bincount(): + def _impl(inputs, attr, params, mod): + input = inputs[0] # input: int32, int64. 1D or 2D int Tensor + size = inputs[1] # size: non-negative int scalar Tensor + # weights: int32, int64, float32, or float64 Tensor with the same shape as arr + # or a length-0 Tensor, in which case it acts as all weights equal to 1. + weights = inputs[2] + # Returns: Output: 1D Tensor with length equal to size + # or 2D Tensor with [batch_size, size]. + # The counts or summed weights for each value in the range [0, size). 
+ + input_dtype = _infer_type(input, mod).checked_type.dtype + input_shape = _infer_shape(input, mod) + is_2d_input = len(input_shape) == 2 + + if input_dtype == "int64": + warnings.warn( + "Casting an int64 input to int32, since we do not have int64 atomic add" + "needed for bincount yet." + ) + input = _op.cast(input, "int32") + + is_weights_zero_tensor = True + if weights: + weights_shape = _infer_shape(weights, mod) + is_weights_zero_tensor = weights_shape == (0,) + + # Output should have the same dtype as weights. + if is_weights_zero_tensor: + # if weights are length-0 Tensor - output dtype is float32 + out_dtype = "float32" + updates = _op.cast(_op.ones_like(input), out_dtype) + else: + out_dtype = _infer_type(weights, mod).checked_type.dtype + updates = weights + + if is_2d_input: + batch_arr = _op.take(_op.shape_of(input), _expr.const([0])) + size_arr = _op.reshape(size, [1]) + counts_shape = _op.concatenate([batch_arr, size_arr], axis=0) + counts = _op.zeros(counts_shape, out_dtype) + out = _op.scatter_add(counts, input, updates, axis=1) + else: + counts_shape = _op.reshape(size, [1]) + counts = _op.zeros(counts_shape, out_dtype) + out = _op.scatter_add(counts, input, updates, axis=0) + + if attr["binary_output"]: + out = _op.cast(_op.cast(out, "bool"), out_dtype) + return out + + return _impl + + # _convert_map defines maps of name to converter functor(callable) # for 1 to 1 mapping, use Renamer if nothing but name is different # use AttrCvt if attributes need to be converted @@ -2913,6 +2967,7 @@ def _impl(inputs, attr, params, mod): "Cosh": AttrCvt("cosh"), "CropAndResize": _crop_and_resize(), "DecodeJpeg": _decode_image(), + "DenseBincount": _dense_bincount(), "DepthToSpace": _depth_to_space(), "DepthwiseConv2dNative": _conv("depthwise"), "Dilation2D": _dilation2d(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index c679425beab1..ebeb35e08f5d 100755 --- 
a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -5758,5 +5758,46 @@ def test_invert_permutation(): compare_tf_with_tvm(x, "Placeholder:0", out_name, no_gpu=False) +####################################################################### +# DenseBincount +# ---- + + +def _test_dense_bincount(in_shape, size, weights, binary_output): + with tf.Graph().as_default(): + inputs = [] + data = [] + inputs.append(tf.placeholder(shape=in_shape, dtype="int32", name="input0")) + data.append(np.random.uniform(0, size, size=in_shape).astype("int32")) + inputs.append(tf.placeholder(shape=(), dtype="int32", name="size")) + data.append(np.array(size, "int32")) + if weights: + inputs.append(tf.placeholder(shape=in_shape, dtype="float32", name="weights")) + data.append(np.reshape(weights, in_shape).astype("float32")) + else: + inputs.append(tf.placeholder(shape=(0,), dtype="float32", name="weights")) + data.append(np.array([], "float32")) + result = tf.raw_ops.DenseBincount( + input=data[0], + size=data[1], + weights=data[2], + binary_output=binary_output, + ) + compare_tf_with_tvm(data, [a.name for a in inputs], result.name, mode="vm") + + +def test_forward_dense_bincount(): + """Test DenseBincount Op""" + for binary_output in [False, True]: + # 2D input + _test_dense_bincount((3, 10), 20, [1.0] * 30, binary_output) + _test_dense_bincount((3, 10), 20, [1.5] * 30, binary_output) + _test_dense_bincount((3, 10), 20, None, binary_output) + # 1D input + _test_dense_bincount((10,), 20, [1.0] * 10, binary_output) + _test_dense_bincount((10,), 20, [1.5] * 10, binary_output) + _test_dense_bincount((10,), 20, None, binary_output) + + if __name__ == "__main__": pytest.main([__file__]) From 90fb79b74c49b585f39469e1a2eec233fdd592e0 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Fri, 9 Sep 2022 16:24:05 +0100 Subject: [PATCH 136/704] [CI] Update Docker images to bring TF 2.9 and integration tests (#12738) [CI] Update Docker images to 
tag 20220908-060034-62bdc91b1 Updates all Docker images to tag 20220908-060034-62bdc91b1, to update TensorFlow/TFLite/Keras to 2.9, and cascaded dependencies such as numpy. Updates ethos-u-vela to 3.4.0. It also brings ONNX and PyTorch to ci_arm, to enable Integration tests to be run in CI. Standardises the minimum CMake version required in CI to be 3.18.4, fixing apps/microtvm/zephyr_cmsisnn to require this version. Finally, adds a new import error in the tutorials documentation which doesn't affect the final result. The new warning added is 'absl:Found untraced functions such as _jit_compiled_convolution_op' --- Jenkinsfile | 20 ++++++++++---------- ci/jenkins/Jenkinsfile.j2 | 20 ++++++++++---------- tests/scripts/task_config_build_cpu.sh | 11 ++++++++++- tests/scripts/task_python_docs.sh | 1 + 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 78071fde4599..ed1cf4b09e6e 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,16 +49,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> -ci_lint = 'tlcpack/ci-lint:20220810-060142-fae79bbc3' -ci_gpu = 'tlcpack/ci-gpu:20220810-060142-fae79bbc3' -ci_cpu = 'tlcpack/ci-cpu:20220810-060142-fae79bbc3' -ci_minimal = 'tlcpack/ci-minimal:20220725-133226-d3cefdaf1' -ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3' -ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3' -ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3' -ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3' -ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f' -ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3' +ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1' +ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1' +ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1' +ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1' +ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1' +ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1' +ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251' +ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1' +ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1' +ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1' // <--- End of regex-scanned config. // Parameters to allow overriding (in Jenkins UI), the images diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 index c932431a44a1..6ba0c2df8efd 100644 --- a/ci/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils {% import 'ci/jenkins/macros.j2' as m with context -%} // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> -ci_lint = 'tlcpack/ci-lint:20220810-060142-fae79bbc3' -ci_gpu = 'tlcpack/ci-gpu:20220810-060142-fae79bbc3' -ci_cpu = 'tlcpack/ci-cpu:20220810-060142-fae79bbc3' -ci_minimal = 'tlcpack/ci-minimal:20220725-133226-d3cefdaf1' -ci_wasm = 'tlcpack/ci-wasm:20220810-060142-fae79bbc3' -ci_i386 = 'tlcpack/ci-i386:20220810-060142-fae79bbc3' -ci_cortexm = 'tlcpack/ci-cortexm:20220810-060142-fae79bbc3' -ci_arm = 'tlcpack/ci-arm:20220810-060142-fae79bbc3' -ci_hexagon = 'tlcpack/ci-hexagon:20220825-145056-fb7cf97f' -ci_riscv = 'tlcpack/ci-riscv:20220810-060142-fae79bbc3' +ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1' +ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1' +ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1' +ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1' +ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1' +ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1' +ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251' +ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1' +ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1' +ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1' // <--- End of regex-scanned config. // Parameters to allow overriding (in Jenkins UI), the images diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 9dc5c62efaa7..7f48839f23c0 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -37,7 +37,16 @@ echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(HIDE_PRIVATE_SYMBOLS ON\) >> config.cmake echo set\(USE_VTA_TSIM ON\) >> config.cmake echo set\(USE_VTA_FSIM ON\) >> config.cmake -echo set\(USE_TFLITE ON\) >> config.cmake + +# This conditional is just to support the transition to cope +# with the change in the way TFLite is built. 
It can be +# removed once we migrate to TensorFlow and TFLite > 2.9.1 +if [ -d "/opt/tflite" ]; then + echo set\(USE_TFLITE \"/opt/tflite\"\) >> config.cmake +else + echo set\(USE_TFLITE ON\) >> config.cmake +fi + echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index d8578fde2817..fdce77bfd9cc 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -88,6 +88,7 @@ IGNORED_WARNINGS=( 'autotvm:Cannot find config for target=cuda -keys=cuda,gpu' # Warning is thrown during TFLite quantization for micro_train tutorial 'absl:For model inputs containing unsupported operations which cannot be quantized, the `inference_input_type` attribute will default to the original type.' + 'absl:Found untraced functions such as _jit_compiled_convolution_op' ) JOINED_WARNINGS=$(join_by '|' "${IGNORED_WARNINGS[@]}") From 75969647fdf5e9f9b60635d1409952c97a29f0e4 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 9 Sep 2022 16:49:37 +0100 Subject: [PATCH 137/704] Aligned CMSIS-NN SHA in TVM to CMSIS top of tree (#12723) Aligned CMSIS-NN SHA in TVM to top of tree of CMSIS. -Aligned buffer size APIs to CMSIS implementations. -Updated the tests to match new CMSIS context buffer sizes. -This change needs updates to cortex-m docker image. 
Change-Id: I13f1ad29fe0ef02f08660eca4c818b5d66145ffc --- docker/install/ubuntu_install_cmsis.sh | 4 ++-- .../backend/contrib/cmsisnn/buffer_size.cc | 20 ++++++++++--------- .../backend/contrib/cmsisnn/buffer_size.h | 7 ++++++- .../backend/contrib/cmsisnn/relay_to_tir.cc | 4 ++-- .../contrib/cmsisnn/buffer_size_test.cc | 8 ++++---- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/docker/install/ubuntu_install_cmsis.sh b/docker/install/ubuntu_install_cmsis.sh index 1116b5bd6929..9fcbcf61cefa 100755 --- a/docker/install/ubuntu_install_cmsis.sh +++ b/docker/install/ubuntu_install_cmsis.sh @@ -39,8 +39,8 @@ shift mkdir -p "${INSTALLATION_PATH}" # Download and extract CMSIS -CMSIS_SHA="e336766b1b5654f36244bca649917281f399bf37" -CMSIS_SHASUM="30c40824c4e008dcb9c6c77adee5115efa0cb04b6701fe2bc31ddf7be2da59f2161aeb4dbe5780cbaa709af23a3e21ea460bb2b84fa12418563125b4d426ac86" +CMSIS_SHA="51263182d16c92649a48144ba56c0945f9fce60e" +CMSIS_SHASUM="d02573e5a8908c741d8558f01be2939aae6e940933ccb58123fa972864947759eefe5d554688db3910c8ed665a248b477b5e4458e12773385c67f8a2136b3b34" CMSIS_URL="http://github.com/ARM-software/CMSIS_5/archive/${CMSIS_SHA}.tar.gz" DOWNLOAD_PATH="/tmp/${CMSIS_SHA}.tar.gz" diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.cc b/src/relay/backend/contrib/cmsisnn/buffer_size.cc index d03d34897f5a..25f4d054e810 100644 --- a/src/relay/backend/contrib/cmsisnn/buffer_size.cc +++ b/src/relay/backend/contrib/cmsisnn/buffer_size.cc @@ -17,6 +17,8 @@ * under the License. 
*/ +#include "buffer_size.h" + #include #include @@ -44,13 +46,13 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_ } if (is1xN) { - if (!has_mve) { - return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); + if (has_mve) { + return 0; } - return 0; + return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); } - if (has_mve) { + if (has_mve || is1xN) { int32_t col_length = input_c * filter_w * filter_h; col_length = (col_length + 7) / 8; return 4 * col_length * 8 * (int32_t)sizeof(int8_t); @@ -61,15 +63,15 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_ } int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c, - int32_t filter_w, int32_t filter_h) { + int32_t filter_w, int32_t filter_h, int32_t dilation_w, + int32_t dilation_h) { bool has_mve = target->GetFeature("has_mve").value_or(Bool(false)); bool has_dsp = target->GetFeature("has_dsp").value_or(Bool(false)); - if (input_c == output_c && input_n == 1) { + if (input_c == output_c && input_n == 1 && dilation_w == 1 && dilation_h == 1) { if (has_mve) { - return (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 4; - } - if (has_dsp) { + return (4 * CH_IN_BLOCK_MVE * filter_w * filter_h) * (int32_t)sizeof(int8_t); + } else if (has_dsp) { return (input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); } } diff --git a/src/relay/backend/contrib/cmsisnn/buffer_size.h b/src/relay/backend/contrib/cmsisnn/buffer_size.h index a6d3d588e2d9..9dae17c0a220 100644 --- a/src/relay/backend/contrib/cmsisnn/buffer_size.h +++ b/src/relay/backend/contrib/cmsisnn/buffer_size.h @@ -34,6 +34,8 @@ namespace relay { namespace contrib { namespace cmsisnn { +#define CH_IN_BLOCK_MVE (124) + /*! 
* \brief Calculates the appropriate buffer size for CMSIS-NN Convolutions * See: @@ -70,11 +72,14 @@ int Conv2dBufferSize(Target target, int32_t padding_w, int32_t padding_h, int32_ * \param output_c - Output channels * \param filter_w - Filter width * \param filter_h - Filter height + * \param dilation_w - Dilation width + * \param dilation_h - Dilation height * * \return Size of buffer to allocate for depthwise convolution */ int DepthwiseConv2dBufferSize(Target target, int32_t input_n, int32_t input_c, int32_t output_c, - int32_t filter_w, int32_t filter_h); + int32_t filter_w, int32_t filter_h, int32_t dilation_w, + int32_t dilation_h); /*! * \brief Calculates the appropriate buffer size for CMSIS-NN Average Pooling diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 5683bc6698be..a5cdfd570fea 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -242,8 +242,8 @@ class RelayToTIRVisitor : public MixedModeMutator { Target target = CreateTarget(transform::PassContext::Current()); size_t context_buffer_size; if (is_depthwise) { - context_buffer_size = - DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h); + context_buffer_size = DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, + filter_h, dilation_w, dilation_h); } else { context_buffer_size = Conv2dBufferSize(target, padding_w, padding_h, input_n, input_h, input_c, output_h, output_w, stride_w, stride_h, diff --git a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc index 9ff42e203ee6..d8870fa71525 100644 --- a/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc +++ b/tests/cpp/relay/backend/contrib/cmsisnn/buffer_size_test.cc @@ -143,7 +143,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, UnEvenChannels) { int32_t input_n = 1; auto 
depthwise_conv2d_with_channels = [=](Target target, int32_t input_c, int32_t output_c) { - return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h); + return DepthwiseConv2dBufferSize(target, input_n, input_c, output_c, filter_w, filter_h, 1, 1); }; ASSERT_EQ(depthwise_conv2d_with_channels(kNoExt, 4, 6), 0); @@ -161,7 +161,7 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, MultipleBatches) { auto depthwise_conv2d_with_batch = [=](Target target, int32_t input_n) { return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w, - filter_h); + filter_h, 1, 1); }; ASSERT_EQ(depthwise_conv2d_with_batch(kNoExt, 4), 0); @@ -179,12 +179,12 @@ TEST(CMSISNNDepthwiseConv2dBufferSize, Default) { int32_t input_n = 1; int32_t mve_calculated_buffer = - (2 * input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t) + 4; + (4 * CH_IN_BLOCK_MVE * filter_w * filter_h) * (int32_t)sizeof(int8_t); int32_t dsp_calculated_buffer = (input_output_c * filter_w * filter_h) * (int32_t)sizeof(int16_t); auto depthwise_conv2d = [=](Target target) { return DepthwiseConv2dBufferSize(target, input_n, input_output_c, input_output_c, filter_w, - filter_h); + filter_h, 1, 1); }; ASSERT_EQ(depthwise_conv2d(kNoExt), 0); From 1d32c400f1d2a14cb3c663c2d17b977b94b2db48 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 9 Sep 2022 09:25:30 -0700 Subject: [PATCH 138/704] [microtvm][Zephyr] Add project overlay to overwrite device tree configs (#12741) * add nucleo overlay --- .../app-overlay/nucleo_l4r5zi.overlay | 23 +++++++++++++++++++ .../template_project/microtvm_api_server.py | 15 ++++++++---- cmake/modules/Zephyr.cmake | 1 + tests/lint/check_file_type.py | 1 + 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay diff --git a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay 
b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay new file mode 100644 index 000000000000..360e0753d4f5 --- /dev/null +++ b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay @@ -0,0 +1,23 @@ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +&rcc { + clock-frequency = ; +}; diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index b73779f68148..5a0bc7309c63 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -567,6 +567,8 @@ def _generate_cmake_args(self, mlf_extracted_path, options) -> str: return cmake_args def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options): + zephyr_board = options["zephyr_board"] + # Check Zephyr version version = self._get_platform_version(get_zephyr_base(options)) if version != ZEPHYR_VERSION: @@ -586,6 +588,11 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # Copy boards.json file to generated project. 
shutil.copy2(BOARDS, project_dir / BOARDS.name) + # Copy overlay files + board_overlay_path = API_SERVER_DIR / "app-overlay" / f"{zephyr_board}.overlay" + if board_overlay_path.exists(): + shutil.copy2(board_overlay_path, project_dir / f"{zephyr_board}.overlay") + # Place Model Library Format tarball in the special location, which this script uses to decide # whether it's being invoked in a template or generated project. project_model_library_format_tar_path = project_dir / MODEL_LIBRARY_FORMAT_RELPATH @@ -597,9 +604,9 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec os.makedirs(extract_path) tf.extractall(path=extract_path) - if self._is_qemu(options["zephyr_board"], options.get("use_fvp")): + if self._is_qemu(zephyr_board, options.get("use_fvp")): shutil.copytree(API_SERVER_DIR / "qemu-hack", project_dir / "qemu-hack") - elif self._is_fvp(options["zephyr_board"], options.get("use_fvp")): + elif self._is_fvp(zephyr_board, options.get("use_fvp")): shutil.copytree(API_SERVER_DIR / "fvp-hack", project_dir / "fvp-hack") # Populate CRT. 
@@ -650,7 +657,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec for item in flags: cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n") - if self._is_fvp(options["zephyr_board"], options.get("use_fvp")): + if self._is_fvp(zephyr_board, options.get("use_fvp")): cmake_f.write(f"target_compile_definitions(app PUBLIC -DFVP=1)\n") self._create_prj_conf(project_dir, options) @@ -665,7 +672,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # Populate src/ src_dir = project_dir / "src" if options["project_type"] != "host_driven" or self._is_fvp( - options["zephyr_board"], options.get("use_fvp") + zephyr_board, options.get("use_fvp") ): shutil.copytree(API_SERVER_DIR / "src" / options["project_type"], src_dir) else: diff --git a/cmake/modules/Zephyr.cmake b/cmake/modules/Zephyr.cmake index be4f85dac33d..644675dcf871 100644 --- a/cmake/modules/Zephyr.cmake +++ b/cmake/modules/Zephyr.cmake @@ -29,6 +29,7 @@ if(USE_MICRO) "apps/microtvm/zephyr/template_project/src/host_driven *.h -> zephyr/src/host_driven" "apps/microtvm/zephyr/template_project/fvp-hack * -> zephyr/fvp-hack" "apps/microtvm/zephyr/template_project/qemu-hack * -> zephyr/qemu-hack" + "apps/microtvm/zephyr/template_project/app-overlay * -> zephyr/app-overlay" "apps/microtvm/zephyr/template_project/crt_config *.h -> zephyr/crt_config" ) diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 7e09c3c7cfa6..51a80431d37f 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -148,6 +148,7 @@ "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv32", "apps/microtvm/zephyr/template_project/qemu-hack/qemu-system-riscv64", "apps/microtvm/zephyr/template_project/fvp-hack/FVP_Corstone_SSE-300_Ethos-U55", + "apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay", # microTVM Virtual Machines "apps/microtvm/poetry.lock", "apps/microtvm/reference-vm/Vagrantfile", 
From 8bd81e6fbca3b7c8511b3b24601c37a3cff19864 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Fri, 9 Sep 2022 11:54:38 -0700 Subject: [PATCH 139/704] [TVMScript] Base IRBuilder methods for `PrimFunc` (#12745) Base IRBuilder methods for `PrimFunc` This PR introduces base IRBuilder methods for `PrimFunc`. Co-authored-by: yongwww Co-authored-by: yongwww --- include/tvm/script/ir_builder/ir/frame.h | 2 + include/tvm/script/ir_builder/ir/ir.h | 2 + include/tvm/script/ir_builder/tir/frame.h | 155 ++++++++++++++++++ include/tvm/script/ir_builder/tir/ir.h | 48 ++++++ python/tvm/script/ir_builder/tir/__init__.py | 18 ++ python/tvm/script/ir_builder/tir/_ffi_api.py | 20 +++ python/tvm/script/ir_builder/tir/frame.py | 31 ++++ python/tvm/script/ir_builder/tir/ir.py | 55 +++++++ src/script/ir_builder/ir/frame.cc | 2 + src/script/ir_builder/ir/ir.cc | 2 + src/script/ir_builder/tir/frame.cc | 59 +++++++ src/script/ir_builder/tir/ir.cc | 50 ++++++ src/script/ir_builder/tir/utils.h | 68 ++++++++ .../unittest/test_tvmscript_ir_builder_tir.py | 49 ++++++ 14 files changed, 561 insertions(+) create mode 100644 include/tvm/script/ir_builder/tir/frame.h create mode 100644 include/tvm/script/ir_builder/tir/ir.h create mode 100644 python/tvm/script/ir_builder/tir/__init__.py create mode 100644 python/tvm/script/ir_builder/tir/_ffi_api.py create mode 100644 python/tvm/script/ir_builder/tir/frame.py create mode 100644 python/tvm/script/ir_builder/tir/ir.py create mode 100644 src/script/ir_builder/tir/frame.cc create mode 100644 src/script/ir_builder/tir/ir.cc create mode 100644 src/script/ir_builder/tir/utils.h create mode 100644 tests/python/unittest/test_tvmscript_ir_builder_tir.py diff --git a/include/tvm/script/ir_builder/ir/frame.h b/include/tvm/script/ir_builder/ir/frame.h index 181774bc53bc..887981ccffc8 100644 --- a/include/tvm/script/ir_builder/ir/frame.h +++ b/include/tvm/script/ir_builder/ir/frame.h @@ -29,6 +29,7 @@ namespace tvm { namespace script { namespace ir_builder { 
+namespace ir { /*! * \brief A frame that represents the IRModule frame with functions and global variables. @@ -64,6 +65,7 @@ class IRModuleFrame : public IRBuilderFrame { IRModuleFrameNode); }; +} // namespace ir } // namespace ir_builder } // namespace script } // namespace tvm diff --git a/include/tvm/script/ir_builder/ir/ir.h b/include/tvm/script/ir_builder/ir/ir.h index 0bd5473c7eaf..f0e7cc6f5c2f 100644 --- a/include/tvm/script/ir_builder/ir/ir.h +++ b/include/tvm/script/ir_builder/ir/ir.h @@ -29,6 +29,7 @@ namespace tvm { namespace script { namespace ir_builder { +namespace ir { /*! * \brief The IRModule declaration statement. @@ -36,6 +37,7 @@ namespace ir_builder { */ TVM_DLL IRModuleFrame IRModule(); +} // namespace ir } // namespace ir_builder } // namespace script } // namespace tvm diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h new file mode 100644 index 000000000000..4bfd022af27a --- /dev/null +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#ifndef TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_ +#define TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_ + +#include +#include +#include + +namespace tvm { +namespace script { +namespace ir_builder { +namespace tir { + +/*! + * \brief A base frame that represents the TIR frame with body of statements. + * + * \sa TIRFrame + */ +class TIRFrameNode : public IRBuilderFrameNode { + public: + /*! \brief The Stmt within this frame. */ + Array stmts; + + void VisitAttrs(tvm::AttrVisitor* v) { + IRBuilderFrameNode::VisitAttrs(v); + v->Visit("stmts", &stmts); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.TIRFrame"; + TVM_DECLARE_BASE_OBJECT_INFO(TIRFrameNode, IRBuilderFrameNode); +}; + +/*! + * \brief Managed reference to TIRFrameNode. + * + * \sa TIRFrameNode + */ +class TIRFrame : public IRBuilderFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(TIRFrame, IRBuilderFrame, TIRFrameNode); + + protected: + TIRFrame() = default; +}; + +/*! + * \brief A frame that represents the PrimFunc containing TIR statements. + * + * \sa PrimFuncFrame + */ +class PrimFuncFrameNode : public TIRFrameNode { + public: + /*! \brief The name of the function. */ + Optional name; + /*! \brief Function parameters. */ + Array args; + /*! \brief The return type of the function. */ + Optional ret_type; + /*! \brief Maps some parameters to specific Buffer data structures. */ + Map buffer_map; + /*! \brief The buffer map prior to flattening. */ + Map preflattened_buffer_map; + /*! \brief Additional attributes storing the meta-data */ + Optional> attrs; + /*! \brief The variable map bound to thread env. */ + Map env_threads; + /*! \brief The buffer allocated in root block. 
*/ + Array root_alloc_buffers; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("name", &name); + v->Visit("args", &args); + v->Visit("ret_type", &ret_type); + v->Visit("buffer_map", &buffer_map); + v->Visit("preflattened_buffer_map", &preflattened_buffer_map); + v->Visit("attrs", &attrs); + v->Visit("env_threads", &env_threads); + v->Visit("root_alloc_buffers", &root_alloc_buffers); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.PrimFuncFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(PrimFuncFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to PrimFuncFrameNode. + * + * \sa PrimFuncFrameNode + */ +class PrimFuncFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(PrimFuncFrame, TIRFrame, PrimFuncFrameNode); +}; + +/*! + * \brief A frame that represents the assert statement. Proceeds if the condition is true, + * otherwise aborts with the message. + * + * \sa AssertFrame + */ +class AssertFrameNode : public TIRFrameNode { + public: + /*! \brief The PrimExpr to test. */ + PrimExpr condition; + /*! \brief The output error message when the assertion failed. */ + PrimExpr message; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("condition", &condition); + v->Visit("message", &message); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.AssertFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(AssertFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. 
+ * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +} // namespace tir +} // namespace ir_builder +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_IR_BUILDER_TIR_FRAME_H_ diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h new file mode 100644 index 000000000000..cee60ad4f827 --- /dev/null +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_IR_BUILDER_TIR_IR_H_ +#define TVM_SCRIPT_IR_BUILDER_TIR_IR_H_ + +#include +#include +#include + +namespace tvm { +namespace script { +namespace ir_builder { +namespace tir { + +/*! + * \brief The primitive function statement. + * \return The PrimFuncFrame. + */ +PrimFuncFrame PrimFunc(); + +/*! + * \brief Evaluate the input expression. + * \param value The input expression to evaluate. 
+ */ +void Evaluate(PrimExpr value); + +} // namespace tir +} // namespace ir_builder +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_IR_BUILDER_TIR_IR_H_ diff --git a/python/tvm/script/ir_builder/tir/__init__.py b/python/tvm/script/ir_builder/tir/__init__.py new file mode 100644 index 000000000000..1e43d1af3498 --- /dev/null +++ b/python/tvm/script/ir_builder/tir/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Package tvm.script.ir_builder.tir""" +from .ir import * # pylint: disable=wildcard-import,redefined-builtin diff --git a/python/tvm/script/ir_builder/tir/_ffi_api.py b/python/tvm/script/ir_builder/tir/_ffi_api.py new file mode 100644 index 000000000000..876f5f3a35a0 --- /dev/null +++ b/python/tvm/script/ir_builder/tir/_ffi_api.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs""" +import tvm._ffi + +tvm._ffi._init_api("script.ir_builder.tir", __name__) # pylint: disable=protected-access diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py new file mode 100644 index 000000000000..61418e0b2aa6 --- /dev/null +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""IRBuilder for TIR""" + +from tvm._ffi import register_object as _register_object + +from ..base import IRBuilderFrame + + +@_register_object("script.ir_builder.tir.TIRFrame") +class TIRFrame(IRBuilderFrame): + ... + + +@_register_object("script.ir_builder.tir.PrimFuncFrame") +class PrimFuncFrame(TIRFrame): + ... 
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py new file mode 100644 index 000000000000..ae5d5b260f65 --- /dev/null +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +"""IRBuilder for TIR""" + +from tvm.tir import PrimExpr, StringImm + +from . import _ffi_api, frame + + +def prim_func() -> frame.PrimFuncFrame: + """The primitive function statement. + + Returns + ------- + res : frame.PrimFuncFrame + The PrimFuncFrame. + """ + return _ffi_api.PrimFunc() # pylint: disable=no-member # type: ignore + + +def evaluate(value: PrimExpr) -> None: + """Evaluate the input expression. + + Parameters + ---------- + value: PrimExpr + The input expression to evaluate. 
+ """ + if isinstance(value, str): + value = StringImm(value) + return _ffi_api.Evaluate(value) # pylint: disable=no-member # type: ignore + + +# pylint: enable=invalid-name + + +__all__ = [ + "evaluate", + "prim_func", +] diff --git a/src/script/ir_builder/ir/frame.cc b/src/script/ir_builder/ir/frame.cc index c85e30544aca..a81c56922dff 100644 --- a/src/script/ir_builder/ir/frame.cc +++ b/src/script/ir_builder/ir/frame.cc @@ -23,6 +23,7 @@ namespace tvm { namespace script { namespace ir_builder { +namespace ir { void IRModuleFrameNode::ExitWithScope() { ICHECK_EQ(functions.size(), global_vars.size()); @@ -38,6 +39,7 @@ void IRModuleFrameNode::ExitWithScope() { TVM_REGISTER_NODE_TYPE(IRModuleFrameNode); +} // namespace ir } // namespace ir_builder } // namespace script } // namespace tvm diff --git a/src/script/ir_builder/ir/ir.cc b/src/script/ir_builder/ir/ir.cc index bcd21de144bb..a8cc452e4f0c 100644 --- a/src/script/ir_builder/ir/ir.cc +++ b/src/script/ir_builder/ir/ir.cc @@ -23,6 +23,7 @@ namespace tvm { namespace script { namespace ir_builder { +namespace ir { IRModuleFrame IRModule() { ObjectPtr n = make_object(); @@ -33,6 +34,7 @@ IRModuleFrame IRModule() { TVM_REGISTER_GLOBAL("script.ir_builder.ir.IRModule").set_body_typed(IRModule); +} // namespace ir } // namespace ir_builder } // namespace script } // namespace tvm diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc new file mode 100644 index 000000000000..139c8193b0ba --- /dev/null +++ b/src/script/ir_builder/tir/frame.cc @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +#include "../../../tir/ir/script/script_complete.h" +#include "./utils.h" + +namespace tvm { +namespace script { +namespace ir_builder { +namespace tir { + +void PrimFuncFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + tvm::tir::PrimFunc func( + /*params=*/args, + /*body=*/AsStmt(stmts), + /*ret_type=*/ret_type.value_or(TupleType::Empty()), + /*buffer_map=*/buffer_map, + /*preflattened_buffer_map=*/preflattened_buffer_map, + /*attrs=*/attrs.defined() ? DictAttrs(attrs.value()) : NullValue()); + func = tvm::tir::ScriptComplete(func, root_alloc_buffers); + IRBuilder builder = IRBuilder::Current(); + if (builder->frames.empty()) { + ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set"; + builder->result = func; + } else if (Optional opt_frame = builder->FindFrame()) { + ir::IRModuleFrame frame = opt_frame.value(); + frame->global_vars.push_back(GlobalVar(name.value_or(""))); + frame->functions.push_back(func); + } else { + LOG(FATAL) << "ValueError: Cannot find where to insert PrimFunc"; + } +} + +TVM_REGISTER_NODE_TYPE(TIRFrameNode); +TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); + +} // namespace tir +} // namespace ir_builder +} // namespace script +} // namespace tvm diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc new file mode 100644 index 000000000000..5f994d71ca0a --- /dev/null +++ b/src/script/ir_builder/tir/ir.cc @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include + +#include "./utils.h" + +namespace tvm { +namespace script { +namespace ir_builder { +namespace tir { + +using tvm::tir::IterVar; + +PrimFuncFrame PrimFunc() { + ObjectPtr n = make_object(); + n->name = NullOpt; + n->args.clear(); + n->ret_type = NullOpt; + n->buffer_map.clear(); + n->preflattened_buffer_map.clear(); + n->attrs = NullOpt; + n->env_threads.clear(); + n->root_alloc_buffers.clear(); + return PrimFuncFrame(n); +} + +void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } +TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); +} // namespace tir +} // namespace ir_builder +} // namespace script +} // namespace tvm diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h new file mode 100644 index 000000000000..47557917cca5 --- /dev/null +++ b/src/script/ir_builder/tir/utils.h @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_ +#define TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_ + +#include +#include +#include + +namespace tvm { +namespace script { +namespace ir_builder { +namespace tir { + +inline void AddToParent(tvm::tir::Stmt stmt) { + IRBuilder builder = IRBuilder::Current(); + if (builder->frames.empty()) { + ICHECK(!builder->result.defined()) << "ValueError: Builder.result has already been set"; + builder->result = stmt; + } else if (const auto* tir_frame = builder->frames.back().as()) { + GetRef(tir_frame)->stmts.push_back(stmt); + } else { + LOG(FATAL) << "TypeError: Unsupported frame type: " << builder->frames.back(); + } +} + +inline tvm::tir::Stmt AsStmt(const Array& stmt) { + using namespace tvm::tir; + if (stmt.empty()) { + return tvm::tir::Evaluate(0); + } else if (stmt.size() == 1) { + return stmt[0]; + } else { + return SeqStmt(stmt); + } +} + +inline PrimFuncFrame FindPrimFuncFrame(const String& method) { + if (Optional frame = IRBuilder::Current()->GetLastFrame()) { + return frame.value(); + } + LOG(FATAL) << "ValueError: PrimFunc frame not find. 
Please ensure '" << method + << "' is called under T.prim_func()"; + throw; +} + +} // namespace tir +} // namespace ir_builder +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_IR_BUILDER_TIR_UTILS_H_ diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py new file mode 100644 index 000000000000..70a8f3565d03 --- /dev/null +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, missing-docstring +"""Unittests for tvm.script.ir_builder.tir""" +import pytest +import tvm.testing +import tvm +from tvm import tir +from tvm.script.ir_builder import tir as T +from tvm.script.ir_builder import IRBuilder +from tvm.ir.base import assert_structural_equal + + +def test_ir_builder_tir_primfunc(): + with IRBuilder() as ib: + with T.prim_func(): + T.evaluate(0) + # the prim_func generated by IRBuilder + prim_func_actual = ib.get() + + # the expected prim_func + prim_func_expected = tir.PrimFunc( + params=[], + body=tir.Evaluate(0), + ret_type=None, + buffer_map=None, + preflattened_buffer_map=None, + attrs=None, + ) + # Check if the generated ir is expected + assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) + + +if __name__ == "__main__": + tvm.testing.main() From 14999f8add61b1a81a0f733ba12aadf2b8057279 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 9 Sep 2022 13:59:55 -0500 Subject: [PATCH 140/704] [TVMScript][TIR] Clarify scope of BlockNode::iter_vars (#12726) Previously, it was ambiguous whether `BlockNode::iter_vars` were in-scope for `BlockRealizeNode::predicate`. `ConvertBlocksToOpaque` treated them as in-scope, and applied a mapping from `iter_vars` to `iter_values`. Similarly, TVMScript printing places `T.where` statements below the `T.axis` statements, where `T.axis` definitions are in scope. However, `BlockRealizeNode::SEqualReduce` and `BlockRealizeNode::SHashReduce` do not visit the block and `iter_vars` until after visiting the predicate, placing the `iter_vars` out of scope. This commit updates the printing of `T.where` to be above `T.axis`, and updates `ConvertBlocksToOpaque` to report an error if the predicate contains references to `BlockNode::iter_vars`. After this commit, these three usages all consistently treat `BlockNode::iter_vars` as out of scope for `BlockRealizeNode::predicate`. 
--- src/printer/tvmscript_printer.cc | 24 ++++++++---- .../transforms/convert_blocks_to_opaque.cc | 39 +++++++++++++++---- ..._tir_transform_convert_blocks_to_opaque.py | 16 +++++++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 5da81de4dc5d..20720373589f 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -283,6 +283,7 @@ class TVMScriptPrinter : public StmtFunctor, Doc AllocBufferDeclaration(const Buffer& buf); Doc PrintBlockVar(const IterVar& iter_var, const PrimExpr& value); Doc PrintBlockVarRemaps(); + Doc PrintBlockPredicate(const BlockRealizeNode* op); Doc PrintBlockVars(const BlockRealizeNode* op); Doc PrintBlockAttr(const BlockRealizeNode* op); Doc PrintExpandedArray(const ArrayNode* op); @@ -1417,6 +1418,14 @@ Doc TVMScriptPrinter::PrintBlockVarRemaps() { return doc; } +Doc TVMScriptPrinter::PrintBlockPredicate(const BlockRealizeNode* op) { + Doc doc; + if (!is_one(op->predicate)) { + doc << Doc::NewLine() << tir_prefix_ << ".where(" << Print(op->predicate) << ")"; + } + return doc; +} + Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) { Doc doc; const auto* block_op = op->block.as(); @@ -1457,10 +1466,7 @@ Doc TVMScriptPrinter::PrintBlockVars(const BlockRealizeNode* op) { Doc TVMScriptPrinter::PrintBlockAttr(const BlockRealizeNode* op) { const auto* block_op = op->block.as(); Doc block_attr_doc; - // print predicate, binding, read/write tensor region, annotations - if (!is_one(op->predicate)) { - block_attr_doc << Doc::NewLine() << tir_prefix_ << ".where(" << Print(op->predicate) << ")"; - } + // print binding, read/write tensor region, annotations block_attr_doc << Doc::NewLine() << tir_prefix_ << ".reads(" << PrintExpandedArray(block_op->reads.as()) << ")"; block_attr_doc << Doc::NewLine() << tir_prefix_ << ".writes(" @@ -1523,14 +1529,18 @@ Doc TVMScriptPrinter::PrintBlockName(const BlockNode* block_op) { Doc 
TVMScriptPrinter::VisitStmt_(const BlockRealizeNode* op) { const auto* block_op = op->block.as(); Doc doc = PrintOptionalInfo(GetRef(block_op)); - // print block name and block vars + // print block name doc << PrintBlockName(block_op); + // Print block predicate. + Doc block_predicate = PrintBlockPredicate(op); + // Print the variable bindings, valid to use in block attributes and + // body Doc block_var = PrintBlockVars(op); - // print predicate, binding, read/write tensor region, annotations + // print read/write tensor region, annotations Doc block_attr_doc = PrintBlockAttr(op); // print body Doc body = PrintBlockBody(block_op); - doc << Doc::Indent(4, block_var << block_attr_doc << Doc::NewLine() << body); + doc << Doc::Indent(4, block_predicate << block_var << block_attr_doc << Doc::NewLine() << body); for (const auto& iter_var : block_op->iter_vars) { TryDeallocVar(iter_var->var); } diff --git a/src/tir/transforms/convert_blocks_to_opaque.cc b/src/tir/transforms/convert_blocks_to_opaque.cc index ddc2e1756908..95648713494c 100644 --- a/src/tir/transforms/convert_blocks_to_opaque.cc +++ b/src/tir/transforms/convert_blocks_to_opaque.cc @@ -45,6 +45,10 @@ class OpaqueBlockConverter : public StmtExprMutator { OpaqueBlockConverter() = default; PrimExpr VisitExpr_(const VarNode* var) final { + CHECK(!forbidden_iter_vars_.count(var)) + << "Variable " << var->name_hint << " occurs in the predicate or iter_values of a block, " + << "but isn't defined until the body of the block"; + auto it = var_substitutes_.find(var); if (it != var_substitutes_.end()) { return it->second; @@ -65,23 +69,42 @@ class OpaqueBlockConverter : public StmtExprMutator { Stmt VisitStmt_(const BlockRealizeNode* realize) final { const auto* block_op = realize->block.get(); ICHECK(!block_op->init.defined()); - // Step 1. Update "block vars => binding values" for substitution. - ICHECK_EQ(block_op->iter_vars.size(), realize->iter_values.size()); + + // Step 1. 
Visit the predicate and iter_values, without any variable bindings + for (const auto& iter : block_op->iter_vars) forbidden_iter_vars_.insert(iter->var.get()); + PrimExpr predicate = VisitExpr(realize->predicate); + Array iter_values = realize->iter_values; + iter_values.MutateByApply([this](PrimExpr expr) { return VisitExpr(std::move(expr)); }); + for (const auto& iter : block_op->iter_vars) forbidden_iter_vars_.erase(iter->var.get()); + + // Step 2. Update "block vars => binding values" for substitution. + ICHECK_EQ(block_op->iter_vars.size(), iter_values.size()); for (int i = 0, n = block_op->iter_vars.size(); i < n; ++i) { IterVar block_var = block_op->iter_vars[i]; - PrimExpr v = this->VisitExpr(realize->iter_values[i]); + PrimExpr v = this->VisitExpr(iter_values[i]); var_substitutes_.emplace(block_var->var.get(), v); } - // Step 2. Visit recursively. - BlockRealize new_realize = Downcast(StmtExprMutator::VisitStmt_(realize)); - if (!new_realize->iter_values.empty()) { - new_realize.CopyOnWrite()->iter_values.clear(); + // Step 3. Visit recursively. + Block new_block = Downcast(VisitStmt(realize->block)); + + // Step 4. Clear the variable bindings + for (const auto& block_var : block_op->iter_vars) { + var_substitutes_.erase(block_var->var.get()); + } + + // Step 5. Return + if (predicate.same_as(realize->predicate) && iter_values.same_as(realize->iter_values) && + new_block.same_as(realize->block) && realize->iter_values.size() == 0) { + return GetRef(realize); + } else { + return BlockRealize({}, predicate, new_block); } - return std::move(new_realize); } /*! \brief The map from block vars to their binding values. 
*/ std::unordered_map var_substitutes_; + /* \brief Variables that may not occur in the current context */ + std::unordered_set forbidden_iter_vars_; }; PrimFunc ConvertBlocksToOpaque(PrimFunc f) { diff --git a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py index 6859a5d75b75..297943bc1381 100644 --- a/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py +++ b/tests/python/unittest/test_tir_transform_convert_blocks_to_opaque.py @@ -82,6 +82,18 @@ def test_lower_te(): tvm.ir.assert_structural_equal(mod, orig_mod) # ConvertBlocksToOpaque should do nothing on TE +class TestErrorIfPredicateUsesBlockVariables(tvm.testing.CompareBeforeAfter): + transform = tvm.tir.transform.ConvertBlocksToOpaque() + + def before(A: T.Buffer[8, "int32"]): + for i in T.serial(8): + with T.block(): + vi = T.axis.remap("S", [i]) + T.where(vi < 6) + T.evaluate(0) + + expected = tvm.TVMError + + if __name__ == "__main__": - test_elementwise() - test_lower_te() + tvm.testing.main() From 574794e915ba424db05e1ddcf2218f37b2b65764 Mon Sep 17 00:00:00 2001 From: Matveenko Valery <50880524+valmat07@users.noreply.github.com> Date: Fri, 9 Sep 2022 21:01:53 +0200 Subject: [PATCH 141/704] [OpenCL] Enable OpenCL for GPU tests (#12490) * Add opencl target in test build script * Fix fp16 test and compile test for opencl * fix lint * Fix relay OpenCL texture tests * Fix lint * Enable relay OpenCL tests * Fix opencl relay texture tests * fix lint * Remove OpenCL gtest variable * Fix unbound variable * Skip tests that are not supported in CI * fix lint * Add path for opencl gtest directory * Fix opencl gtests include directory * Enable OpenCL googletest. 
Fix bug in opencl timer test * testing fix for build cpp tests * update googletest git version for opencl tests build * update cmakelist * Update CMakeList * Update CMakeList * Disable opencl googletests * update Opecnl.cmake * fix Opecnl.cmake * Apply comments. Remove xfail decerator for opencl tests. Now specific tests are skipped in the environment script * minor code changes * apply comments * apply comment * skip test in ci by decorator * fix pytest skipif warnings * Fix skipif for opencl gtests --- src/runtime/opencl/opencl_common.h | 2 +- tests/cpp-runtime/opencl/opencl_timer_test.cc | 1 + tests/cpp-runtime/opencl/run_gtests.cc | 2 +- .../contrib/test_opencl/test_run_gtests.py | 1 + tests/python/driver/tvmc/test_compiler.py | 3 +- .../test_conv2d_nchw_texture.py | 107 +++++++----------- .../test_conv2d_nhwc_texture.py | 92 ++++++--------- .../test_depthwise_conv2d_nchw_texture.py | 26 ++--- .../test_depthwise_conv2d_nhwc_texture.py | 32 ++---- .../utils/adreno_utils.py | 0 .../unittest/test_target_codegen_vulkan.py | 3 + tests/scripts/task_config_build_gpu.sh | 1 + tests/scripts/task_python_integration.sh | 6 +- .../task_python_integration_gpuonly.sh | 3 +- 14 files changed, 112 insertions(+), 167 deletions(-) rename tests/python/relay/{ => opencl_texture}/test_conv2d_nchw_texture.py (90%) rename tests/python/relay/{ => opencl_texture}/test_conv2d_nhwc_texture.py (87%) rename tests/python/relay/{ => opencl_texture}/test_depthwise_conv2d_nchw_texture.py (91%) rename tests/python/relay/{ => opencl_texture}/test_depthwise_conv2d_nhwc_texture.py (91%) rename tests/python/relay/{ => opencl_texture}/utils/adreno_utils.py (100%) diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index f16e1e936d96..7f7f083cf303 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -439,9 +439,9 @@ class OpenCLTimerNode : public TimerNode { public: // Timer start virtual void Start() { + this->duration = 0; if 
(count_timer_execs == 0) { cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear(); - this->duration = 0; // Very first call of Start() leads to the recreation of // OpenCL command queue in profiling mode. This allows to run profile after inference. recreateCommandQueue(); diff --git a/tests/cpp-runtime/opencl/opencl_timer_test.cc b/tests/cpp-runtime/opencl/opencl_timer_test.cc index 40ec65d8dfe2..6faf2f6a1482 100644 --- a/tests/cpp-runtime/opencl/opencl_timer_test.cc +++ b/tests/cpp-runtime/opencl/opencl_timer_test.cc @@ -46,6 +46,7 @@ TEST(OpenCLTimerNode, nested_timers) { cl_mem cl_buf = clCreateBuffer(workspace->context, CL_MEM_READ_ONLY, BUFF_SIZE * sizeof(cl_int), NULL, &err); OPENCL_CHECK_ERROR(err); + queue = workspace->GetQueue(thr->device); OPENCL_CALL(clEnqueueWriteBuffer(queue, cl_buf, false, 0, BUFF_SIZE * sizeof(cl_int), tmp_buf, 0, NULL, &ev)); OPENCL_CALL(clReleaseMemObject(cl_buf)); diff --git a/tests/cpp-runtime/opencl/run_gtests.cc b/tests/cpp-runtime/opencl/run_gtests.cc index b16ae3efc74d..ffe86a7f52c0 100644 --- a/tests/cpp-runtime/opencl/run_gtests.cc +++ b/tests/cpp-runtime/opencl/run_gtests.cc @@ -40,7 +40,7 @@ TVM_REGISTER_GLOBAL("opencl.run_gtests").set_body([](TVMArgs args, TVMRetValue* argv.push_back(const_cast("opencl_run_gtests")); // add parsed arguments - for (int i = 0; i < parsed_args.size(); ++i) { + for (size_t i = 0; i < parsed_args.size(); ++i) { argv.push_back(const_cast(parsed_args[i].data())); } diff --git a/tests/python/contrib/test_opencl/test_run_gtests.py b/tests/python/contrib/test_opencl/test_run_gtests.py index 4afcf7ee8d66..ee59086b25f1 100644 --- a/tests/python/contrib/test_opencl/test_run_gtests.py +++ b/tests/python/contrib/test_opencl/test_run_gtests.py @@ -28,6 +28,7 @@ # for example to run all "foo" tests twice and observe gtest output run # pytest -sv --gtests_args="--gtest_filter=*foo* --gtest_repeat=2" @tvm.testing.requires_opencl +@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to 
nvidia libOpencl in the CI") def test_run_gtests(gtest_args): if ( "TVM_TRACKER_HOST" in os.environ diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 27cd78d436c7..5535fc02249f 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -367,8 +367,9 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): tvmc_model = tvmc.load(tflite_mobilenet_v1_0_25_128) tvmc_package = tvmc.compile( tvmc_model, - target="opencl --host=llvm", + target="opencl -host=llvm", desired_layout="NCHW", + dump_code="asm", ) dumps_path = tvmc_package.package_path + ".asm" diff --git a/tests/python/relay/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py similarity index 90% rename from tests/python/relay/test_conv2d_nchw_texture.py rename to tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py index ab12e40b39cb..504a2b4e3ed3 100644 --- a/tests/python/relay/test_conv2d_nchw_texture.py +++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py @@ -22,13 +22,15 @@ from tvm.relay import testing from tvm.contrib import utils from utils.adreno_utils import gpu_preprocess, build_run_compare +import pytest -@tvm.testing.requires_opencl -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): - target = "opencl --device=adreno" - dtype = "float16" +dtype = tvm.testing.parameter("float32") + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype): input_shape = (1, 32, 42, 42) filter_shape = (96, 32, 3, 3) bias_shape = (1, 96, 1, 1) @@ -67,10 +69,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def 
test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype): input_shape = (1, 32, 40, 40) filter_shape = (96, 32, 2, 2) bias_shape = (1, 96, 1, 1) @@ -109,10 +109,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_35_35_strides(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_35_35_strides(target, dtype): input_shape = (1, 48, 35, 35) filter_shape = (64, 48, 5, 5) bias_shape = (1, 64, 1, 1) @@ -151,10 +149,8 @@ def test_conv2d_inceptionv3_35_35_strides(): @tvm.testing.requires_opencl -def test_conv2d_resnet50_v2_nchw_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_resnet50_v2_nchw_3c(target, dtype): input_shape = (1, 3, 224, 224) filter_shape = (64, 3, 7, 7) bias_shape = (1, 64, 1, 1) @@ -194,10 +190,8 @@ def test_conv2d_resnet50_v2_nchw_3c(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_nchw_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_nchw_3c(target, dtype): input_shape = (1, 3, 299, 299) filter_shape = (64, 3, 3, 3) bias_shape = (1, 64, 1, 1) @@ -236,10 +230,8 @@ def test_conv2d_inceptionv3_nchw_3c(): @tvm.testing.requires_opencl -def test_conv2d_1x1_16c16spatial(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_1x1_16c16spatial(target, dtype): input_shape = (1, 16, 256, 256) filter_shape = (32, 16, 4, 4) bias_shape = (1, 32, 1, 1) @@ -278,10 +270,8 @@ def test_conv2d_1x1_16c16spatial(): @tvm.testing.requires_opencl -def test_conv2d_4x4_16c16pad(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def 
test_conv2d_4x4_16c16pad(target, dtype): input_shape = (1, 32, 256, 256) filter_shape = (32, 32, 4, 4) bias_shape = (1, 32, 1, 1) @@ -320,10 +310,8 @@ def test_conv2d_4x4_16c16pad(): @tvm.testing.requires_opencl -def test_conv2d_4x4x4_16c16pad(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_4x4x4_16c16pad(target, dtype): input_shape = (1, 32, 256, 256) filter_shape = (4, 32, 4, 4) bias_shape = (1, 4, 1, 1) @@ -362,10 +350,8 @@ def test_conv2d_4x4x4_16c16pad(): @tvm.testing.requires_opencl -def test_conv2d_yolov3_v2_nchw_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_yolov3_v2_nchw_3c(target, dtype): input_shape = (1, 1024, 13, 13) filter_shape = (255, 1024, 1, 1) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -397,10 +383,8 @@ def test_conv2d_yolov3_v2_nchw_3c(): @tvm.testing.requires_opencl -def test_conv2d_vgg16_winograd_4d(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_vgg16_winograd_4d(target, dtype): input_shape = (1, 512, 28, 28) filter_shape = (512, 512, 3, 3) bias_shape = (1, 512, 1, 1) @@ -437,7 +421,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "float16"], ["TENSOR", [512, 512, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + f'{{"input": ["opencl 
-keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 512, 28, 28], "{dtype}"], ["TENSOR", [512, 512, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file @@ -447,10 +431,8 @@ def test_conv2d_vgg16_winograd_4d(): @tvm.testing.requires_opencl -def test_conv2d_winograd_conv(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_winograd_conv(target, dtype): input_shape = (1, 4, 3, 3) A = relay.var("data", shape=input_shape, dtype=dtype) filter_shape3 = (8, 4, 3, 3) @@ -486,7 +468,7 @@ def test_conv2d_winograd_conv(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "float16"], ["TENSOR", [8, 4, 3, 3], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nchw_winograd.image2d", [["TENSOR", [1, 4, 3, 3], "{dtype}"], ["TENSOR", [8, 4, 3, 3], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": 
[["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file @@ -496,7 +478,9 @@ def test_conv2d_winograd_conv(): @tvm.testing.requires_opencl -def test_residual_block(): +@tvm.testing.parametrize_targets("opencl -device=adreno") +@pytest.mark.skipif(tvm.testing.utils.IS_IN_CI, reason="failed due to nvidia libOpencl in the CI") +def test_residual_block(target, dtype): """ - some kind of residual block followed by convolution to have texture after residual block - scalar data type verification which should be mapped to global memory scope @@ -515,9 +499,6 @@ def test_residual_block(): | <- buffer layout_transform (NCHW4c->NCHW) """ - target = "opencl --device=adreno" - dtype = "float16" - input_shape = (1, 32, 40, 40) filter_shape1 = (32, 32, 2, 2) filter_shape2 = (32, 32, 1, 1) @@ -555,7 +536,7 @@ def test_residual_block(): kernel_size=(1, 1), ) D = relay.op.add(conv2, D) - D = D * relay.const(0.15, "float16") + D = D * relay.const(0.15, dtype) D = relay.op.nn.relu(D) conv3 = relay.nn.conv2d( @@ -607,7 +588,8 @@ def test_residual_block(): @tvm.testing.requires_opencl -def test_concat(): +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_concat(target, dtype): """ layout_transform (NCHW->NCHW4c) | <- buffer @@ -619,9 +601,6 @@ def test_concat(): | <- buffer layout_transform (NCHW4c->NCHW) """ - target = "opencl --device=adreno" - dtype = "float16" - input_shape = (1, 32, 40, 40) filter_shape1 = (96, 32, 2, 2) filter_shape2 = (32, 96, 2, 2) @@ -721,7 +700,8 @@ def test_concat(): @tvm.testing.requires_opencl -def test_pooling_branching_texture_params(): +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_pooling_branching_texture_params(target, dtype): """ 
Verification of the pooling and many branches having textures layout_transform (NCHW->NCHW4c) @@ -738,9 +718,6 @@ def test_pooling_branching_texture_params(): | <- buffer layout_transform (NCHW4c->NCHW) """ - target = "opencl --device=adreno" - dtype = "float16" - input_shape = (1, 32, 40, 40) filter_shape0 = (32, 32, 1, 1) filter_shape1 = (32, 32, 2, 2) @@ -849,7 +826,8 @@ def test_pooling_branching_texture_params(): @tvm.testing.requires_opencl -def test_branching_texture_params(): +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_branching_texture_params(target, dtype): """ Verification of passing texture to several consumers markup of relay variables in primary functions + on_device @@ -866,9 +844,6 @@ def test_branching_texture_params(): | <- buffer layout_transform (NCHW4c->NCHW) """ - target = "opencl --device=adreno" - dtype = "float16" - input_shape = (1, 32, 40, 40) filter_shape0 = (32, 32, 1, 1) filter_shape1 = (32, 32, 2, 2) @@ -976,7 +951,8 @@ def test_branching_texture_params(): # function repeat, params scope are different in reused functions @tvm.testing.requires_opencl -def test_conv2d_different_lowering_same_op(): +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_different_lowering_same_op(target, dtype): """ Use case for verification of caching compiled functions Three convolutions following by each other in this case should be @@ -993,9 +969,6 @@ def test_conv2d_different_lowering_same_op(): | <- buffer layout_transform (NCHW4c->NCHW) """ - target = "opencl --device=adreno" - dtype = "float16" - input_shape = (1, 32, 40, 40) filter_shape1 = (32, 32, 1, 1) A = relay.var("data", shape=input_shape, dtype=dtype) diff --git a/tests/python/relay/test_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py similarity index 87% rename from tests/python/relay/test_conv2d_nhwc_texture.py rename to tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py index 
cf8116c076cc..37c22137f035 100644 --- a/tests/python/relay/test_conv2d_nhwc_texture.py +++ b/tests/python/relay/opencl_texture/test_conv2d_nhwc_texture.py @@ -23,13 +23,15 @@ from tvm.relay import testing from tvm.contrib import utils from utils.adreno_utils import gpu_preprocess, build_run_compare +import pytest -@tvm.testing.requires_opencl -def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(): - target = "opencl --device=adreno" - dtype = "float16" +dtype = tvm.testing.parameter("float32") + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(target, dtype): input_shape = (1, 257, 257, 32) filter_shape = (1, 1, 32, 16) bias_shape = (filter_shape[-1],) @@ -65,10 +67,8 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16(): @tvm.testing.requires_opencl -def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(target, dtype): input_shape = (1, 257, 257, 32) filter_shape = (1, 1, 32, 16) bias_shape = (filter_shape[-1],) @@ -107,10 +107,8 @@ def test_conv2d_deeplabv3_1_257_257_32x1_1_32_16_with_padding(): @tvm.testing.requires_opencl -def test_conv2d_4_35_35_32x3_3_144_16(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_4_35_35_32x3_3_144_16(target, dtype): input_shape = (4, 35, 35, 32) filter_shape = (3, 3, 32, 16) bias_shape = (filter_shape[-1],) @@ -147,10 +145,8 @@ def test_conv2d_4_35_35_32x3_3_144_16(): @tvm.testing.requires_opencl -def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(target, dtype): input_shape = (1, 513, 513, 3) filter_shape = (3, 
3, 3, 32) bias_shape = (filter_shape[-1],) @@ -187,10 +183,8 @@ def test_conv2d_deeplabv3_1_513_513_3x3_3_3_32(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(target, dtype): input_shape = (1, 42, 42, 32) filter_shape = (3, 3, 32, 96) bias_shape = (1, 1, 1, 96) @@ -229,10 +223,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(target, dtype): input_shape = (1, 40, 40, 32) filter_shape = (2, 2, 32, 96) bias_shape = (1, 1, 1, 96) @@ -271,10 +263,8 @@ def test_conv2d_inceptionv3_64x35x35_96x64x3x3_nopad_pass(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_35_35_strides(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_inceptionv3_35_35_strides(target, dtype): input_shape = (1, 35, 35, 48) filter_shape = (5, 5, 48, 64) bias_shape = (1, 1, 1, 64) @@ -313,10 +303,8 @@ def test_conv2d_inceptionv3_35_35_strides(): @tvm.testing.requires_opencl -def test_conv2d_resnet50_v2_nhwc_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_resnet50_v2_nhwc_3c(target, dtype): input_shape = (1, 224, 224, 3) filter_shape = (7, 7, 3, 64) bias_shape = (1, 1, 1, 64) @@ -356,10 +344,8 @@ def test_conv2d_resnet50_v2_nhwc_3c(): @tvm.testing.requires_opencl -def test_conv2d_inceptionv3_nhwc_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def 
test_conv2d_inceptionv3_nhwc_3c(target, dtype): input_shape = (1, 299, 299, 3) filter_shape = (3, 3, 3, 64) bias_shape = (1, 1, 1, 64) @@ -398,11 +384,9 @@ def test_conv2d_inceptionv3_nhwc_3c(): @tvm.testing.requires_opencl -def test_conv2d_1x1_16c16spatial(): - target = "opencl --device=adreno" - dtype = "float16" - - input_shape = (1, 256, 256, 16) +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_1x1_16c16spatial(target, dtype): + input_shape = (1, 128, 128, 16) filter_shape = (4, 4, 16, 32) bias_shape = (1, 1, 1, 32) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -440,10 +424,8 @@ def test_conv2d_1x1_16c16spatial(): @tvm.testing.requires_opencl -def test_conv2d_4x4_16c16pad(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_4x4_16c16pad(target, dtype): input_shape = (1, 256, 256, 32) filter_shape = (4, 4, 32, 32) bias_shape = (1, 1, 1, 32) @@ -482,10 +464,8 @@ def test_conv2d_4x4_16c16pad(): @tvm.testing.requires_opencl -def test_conv2d_4x4x4_16c16pad(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_4x4x4_16c16pad(target, dtype): input_shape = (1, 256, 256, 32) filter_shape = (4, 4, 32, 4) bias_shape = (1, 1, 1, 4) @@ -523,10 +503,8 @@ def test_conv2d_4x4x4_16c16pad(): @tvm.testing.requires_opencl -def test_conv2d_yolov3_v2_nhwc_3c(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_conv2d_yolov3_v2_nhwc_3c(target, dtype): input_shape = (1, 13, 13, 1024) filter_shape = (1, 1, 1024, 255) A = relay.var("data", shape=input_shape, dtype=dtype) @@ -558,10 +536,8 @@ def test_conv2d_yolov3_v2_nhwc_3c(): @tvm.testing.requires_opencl -def test_conv2d_vgg16_winograd_4d(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") 
+def test_conv2d_vgg16_winograd_4d(target, dtype): input_shape = (1, 28, 28, 512) filter_shape = (3, 3, 512, 512) bias_shape = (1, 1, 1, 512) @@ -598,7 +574,7 @@ def test_conv2d_vgg16_winograd_4d(): stat_file = temp.relpath("stat.log") with open(stat_file, "w") as f: f.write( - '{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "float16"], ["TENSOR", [3, 3, 512, 512], "float16"], [1, 1], [1, 1, 1, 1], [1, 1], "float16"], {}], "config": {"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}\n' + f'{{"input": ["opencl -keys=adreno,opencl,gpu -device=adreno -max_num_threads=256", "conv2d_nhwc_winograd.image2d", [["TENSOR", [1, 28, 28, 512], "{dtype}"], ["TENSOR", [3, 3, 512, 512], "{dtype}"], [1, 1], [1, 1, 1, 1], [1, 1], "{dtype}"], {{}}], "config": {{"index": 1591, "code_hash": null, "entity": [["auto_unroll_max_step", "ot", 4], ["tile_y", "sp", [-1, 1, 32]], ["tile_x", "sp", [-1, 4, 2]], ["tile_rc", "sp", [-1, 8]]]}}, "result": [[0.0037244], 0, 7.06374192237854, 1653898629.7427933], "version": 0.2, "tvm_version": "0.8.dev0"}}\n' ) graph = build_run_compare( mod, params1, {"data": input_shape}, dtype, target, stat_file=stat_file diff --git a/tests/python/relay/test_depthwise_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py similarity index 91% rename from tests/python/relay/test_depthwise_conv2d_nchw_texture.py rename to tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py index c94d085b5115..0ac92d03b6f9 100644 --- a/tests/python/relay/test_depthwise_conv2d_nchw_texture.py +++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nchw_texture.py @@ -22,12 +22,12 @@ from tvm.relay 
import testing from utils.adreno_utils import gpu_preprocess, build_run_compare +dtype = tvm.testing.parameter("float32") -@tvm.testing.requires_opencl -def test_depthwise_conv2d_bias_nchwc(): - target = "opencl --device=adreno" - dtype = "float16" +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_bias_nchwc(target, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ -68,10 +68,8 @@ def test_depthwise_conv2d_bias_nchwc(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_nchwc(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_nchwc(target, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ -107,10 +105,8 @@ def test_depthwise_conv2d_nchwc(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_bias_nchw(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_bias_nchw(target, dtype): input_shape = (1, 64, 112, 112) filter_shape = (64, 1, 3, 3) bias_shape = (1, 64, 1, 1) @@ -151,10 +147,8 @@ def test_depthwise_conv2d_bias_nchw(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_repack_bias_nchw(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_repack_bias_nchw(target, dtype): input_shape = (1, 63, 112, 112) filter_shape = (63, 1, 3, 3) bias_shape = (1, 63, 1, 1) diff --git a/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py similarity index 91% rename from tests/python/relay/test_depthwise_conv2d_nhwc_texture.py rename to tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py index 16f9b8749909..3af7db3a4e1f 100644 
--- a/tests/python/relay/test_depthwise_conv2d_nhwc_texture.py +++ b/tests/python/relay/opencl_texture/test_depthwise_conv2d_nhwc_texture.py @@ -22,12 +22,12 @@ from tvm.relay import testing from utils.adreno_utils import build_run_compare +dtype = tvm.testing.parameter("float32") -@tvm.testing.requires_opencl -def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(): - target = "opencl --device=adreno" - dtype = "float16" +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(target, dtype): input_shape = (1, 129, 129, 144) filter_shape = (3, 3, 144, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -66,10 +66,8 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(target, dtype): input_shape = (4, 35, 35, 576) filter_shape = (3, 3, 576, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -108,10 +106,8 @@ def test_depthwise_conv2d_deeplabv3_4_35_35_576x3_3_576_1(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(target, dtype): input_shape = (1, 129, 129, 144) filter_shape = (3, 3, 144, 1) kernel_size = (filter_shape[0], filter_shape[1]) @@ -152,10 +148,8 @@ def test_depthwise_conv2d_deeplabv3_1_129_129_144x3_3_144_1_with_padding(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_1_513_513_7x3_3_7_1(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def 
test_depthwise_conv2d_1_513_513_7x3_3_7_1(target, dtype): input_shape = (1, 513, 513, 7) filter_shape = (3, 3, 7, 1) bias_shape = (filter_shape[2],) @@ -193,10 +187,8 @@ def test_depthwise_conv2d_1_513_513_7x3_3_7_1(): @tvm.testing.requires_opencl -def test_depthwise_conv2d_1_513_513_3x3_3_3_1(): - target = "opencl --device=adreno" - dtype = "float16" - +@tvm.testing.parametrize_targets("opencl -device=adreno") +def test_depthwise_conv2d_1_513_513_3x3_3_3_1(target, dtype): input_shape = (1, 513, 513, 3) filter_shape = (3, 3, 3, 1) bias_shape = (filter_shape[2],) diff --git a/tests/python/relay/utils/adreno_utils.py b/tests/python/relay/opencl_texture/utils/adreno_utils.py similarity index 100% rename from tests/python/relay/utils/adreno_utils.py rename to tests/python/relay/opencl_texture/utils/adreno_utils.py diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py index 73e840208549..76cad250e053 100644 --- a/tests/python/unittest/test_target_codegen_vulkan.py +++ b/tests/python/unittest/test_target_codegen_vulkan.py @@ -16,6 +16,7 @@ # under the License. 
import os +from posixpath import split import random import re import threading @@ -91,6 +92,8 @@ def test_array_copy(dev, dtype, fuzz_seed): def test_array_vectorize_add(target, dev, dtype): arr_size = 64 lanes = 2 + if "opencl" in target and dtype == "float16": + pytest.xfail("Opencl target does not support float16") num_thread = 8 diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index f79076e213cb..5163a16da3cd 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -28,6 +28,7 @@ echo set\(USE_CUDNN ON\) >> config.cmake echo set\(USE_CUDA ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_OPENGL ON\) >> config.cmake +echo set\(USE_OPENCL ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake echo set\(USE_LLVM \"/usr/bin/llvm-config-9 --link-static\"\) >> config.cmake diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index fc7cbf3a88e7..5eac7b45ba61 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -61,12 +61,14 @@ run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module-1 apps/dso run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-integration tests/python/integration # Ignoring Arm(R) Ethos(TM)-U NPU tests in the collective to run to run them in parallel in the next step. 
-run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn - +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib --ignore=tests/python/contrib/test_ethosu --ignore=tests/python/contrib/test_cmsisnn # forked is needed because the global registry gets contaminated TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay +# OpenCL texture test. Deselected specific tests that fails in CI +TVM_TEST_TARGETS="${TVM_RELAY_OPENCL_TEXTURE_TARGETS:-opencl}" \ + run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-opencl-texture tests/python/relay/opencl_texture # Command line driver test run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh index 3ce5571caa0e..432984c95561 100755 --- a/tests/scripts/task_python_integration_gpuonly.sh +++ b/tests/scripts/task_python_integration_gpuonly.sh @@ -18,9 +18,10 @@ set -exo pipefail -export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu" +export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_sw_emu,adreno" export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" export TVM_RELAY_TEST_TARGETS="cuda" +export TVM_RELAY_OPENCL_TEXTURE_TARGETS="opencl -device=adreno" export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu export TVM_INTEGRATION_GPU_ONLY=1 From b21bf6638bc9a0b339bdbebeae9630ddb583b5a9 Mon Sep 17 00:00:00 2001 From: Black <32191045+blackkker@users.noreply.github.com> Date: Sat, 10 Sep 2022 03:02:16 +0800 Subject: [PATCH 142/704] [Frontend][Paddle] Fix op in paddle did't transmit layout information (#12658) [Frontend][Paddle] Fix adaptive_avg_pool2d in paddle did't transmit layout information --- 
python/tvm/relay/frontend/paddlepaddle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py index a869e2e1b807..9b909895e084 100644 --- a/python/tvm/relay/frontend/paddlepaddle.py +++ b/python/tvm/relay/frontend/paddlepaddle.py @@ -1193,6 +1193,7 @@ def convert_pool2d(g, op, block): paddings = op.attr("paddings") padding_algorithm = op.attr("padding_algorithm") pooling_type = op.attr("pooling_type") + data_format = op.attr("data_format") if global_pooling: adaptive = True @@ -1260,7 +1261,9 @@ def convert_pool2d(g, op, block): input_x, pool_size=ksize, strides=strides, padding=paddings, ceil_mode=ceil_mode ) else: - out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])(input_x, output_size=ksize) + out = getattr(_op.nn, "adaptive_" + op_map[pooling_type])( + input_x, output_size=ksize, layout=data_format + ) g.add_node(op.output("Out")[0], out) From 029fa462d22ce3c75bc5ea530eece999a160c05b Mon Sep 17 00:00:00 2001 From: wrongtest Date: Sat, 10 Sep 2022 03:10:50 +0800 Subject: [PATCH 143/704] [TIR][Arith] Add more strict checking in imm construction and folding. (#12515) * Add more strict check in tir imm construction and folding. 
* fix bool-compare compile error * fix some illegal imm construction in testcases * do not test i64 overflow behaviour because it is not consistent on cython and ctypes * fix float32 testcase * auto-inferred dtype should be int64 when value exceeds int32 range * add floatimm range check for fp16 and fp32 * add more folding testcases and fix store fp32 folding result to double * fix i386 fp16 cases --- include/tvm/tir/op.h | 9 +- python/tvm/runtime/object_generic.py | 14 +- python/tvm/script/tir/intrin.py | 5 + src/arith/const_fold.h | 112 +++- src/ir/expr.cc | 32 +- src/support/scalars.cc | 4 - src/support/scalars.h | 4 + tests/python/relay/test_op_level4.py | 2 +- tests/python/relay/test_pass_fuse_ops.py | 2 +- .../unittest/test_arith_rewrite_simplify.py | 2 + .../unittest/test_target_codegen_cuda.py | 7 +- tests/python/unittest/test_tir_imm_values.py | 577 ++++++++++++++++++ .../test_tir_transform_narrow_datatype.py | 9 - 13 files changed, 743 insertions(+), 36 deletions(-) create mode 100644 tests/python/unittest/test_tir_imm_values.py diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index b4c5d45cbf8e..0939e25efddf 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -911,7 +911,9 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value, Span span = Span()) if (t.is_uint()) { // Use IntImm if it is a small integer uint64_t uval = static_cast(value); - if (uval <= static_cast(std::numeric_limits::max())) { + if (value < static_cast(0)) { + LOG(FATAL) << "cannot make uint from negative value " << value; + } else if (uval <= static_cast(std::numeric_limits::max())) { return IntImm(t, static_cast(value), span); } else { uint64_t mask = (static_cast(1) << 32U) - 1U; @@ -932,6 +934,11 @@ inline PrimExpr MakeConstScalar(DataType t, ValueType value, Span span = Span()) return PrimExpr(); } +template <> +inline PrimExpr MakeConstScalar(DataType t, bool value, Span span) { + return MakeConstScalar(t, static_cast(value), span); +} + template 
inline PrimExpr make_const(DataType t, ValueType value, Span span) { if (t.lanes() == 1) { diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py index 7a55d3ef244e..05426dfb1aeb 100644 --- a/python/tvm/runtime/object_generic.py +++ b/python/tvm/runtime/object_generic.py @@ -115,11 +115,17 @@ def _scalar_type_inference(value): elif isinstance(value, bool): dtype = "bool" elif isinstance(value, float): - # We intentionally convert the float to float32 since it's more common in DL. - dtype = "float32" + # We intentionally prefer convert the float to float32 since it's more common in DL. + if -3.40282347e38 <= value <= 3.40282347e38: + dtype = "float32" + else: + dtype = "float64" elif isinstance(value, int): - # We intentionally convert the python int to int32 since it's more common in DL. - dtype = "int32" + # We intentionally prefer convert the python int to int32 since it's more common in DL. + if -2147483648 <= value <= 2147483647: + dtype = "int32" + else: + dtype = "int64" else: raise NotImplementedError( "Cannot automatically inference the type." " value={}".format(value) diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py index f3919afe5a24..bd9aa1fdadfd 100644 --- a/python/tvm/script/tir/intrin.py +++ b/python/tvm/script/tir/intrin.py @@ -89,6 +89,11 @@ def truncmod(x, y, span): return tvm.tir.truncmod(x, y, span) +@register +def truncdiv(x, y, span): + return tvm.tir.truncdiv(x, y, span) + + @register def ceildiv(x, y, span): return tvm.tir.ceildiv(x, y, span) diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h index 9c3afe41b901..d0e09a1a7429 100644 --- a/src/arith/const_fold.h +++ b/src/arith/const_fold.h @@ -29,6 +29,7 @@ #include #include +#include #include "int_operator.h" @@ -73,6 +74,39 @@ inline bool IsIndexType(const DataType& type) { return type.is_int() && type.lanes() == 1 && (type.bits() == 32 || type.bits() == 64); } +/*! 
\brief Helper to get const folding result repr in int64. */ +inline int64_t GetFoldResultInt64Repr(int64_t x, const DataType& dtype) { + if (dtype.bits() < 64) { + x &= (1LL << dtype.bits()) - 1; + } + if (dtype.is_int()) { + // get sign extended value of integer with specified bits + int64_t m = 1LL << (dtype.bits() - 1); + x = (x ^ m) - m; + } + return x; +} + +/*! \brief Helper to get fp32 const folding result repr in double. */ +inline double GetFoldResultDoubleRepr(float x) { + double res = static_cast(x); + if (std::isinf(res) || std::isnan(res)) { + return res; + } + // certain platform (eg, on gcc7-i386) do the folding arithmetic + // on float and write back to double is optimized to double + // precision arithmetic, this is legal and we check the output + // range thus to ensure consistency when the float result is inf. + if (res < std::numeric_limits::lowest()) { + LOG(WARNING) << "underlying float value overflow"; + return -std::numeric_limits::infinity(); + } else if (res > std::numeric_limits::max()) { + LOG(WARNING) << "underlying float value overflow"; + return std::numeric_limits::infinity(); + } + return res; +} + #define TVM_ARITH_CONST_PROPAGATION(BODY) \ using tir::FloatImmNode; \ const IntImmNode* pa = a.as(); \ @@ -95,10 +129,22 @@ template <> inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); - if (pa && pb) return IntImm(rtype, pa->value + pb->value); + if (pa && pb) { + int64_t res = pa->value + pb->value; + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); + } if (pa && pa->value == 0) return b; if (pb && pb->value == 0) return a; - if (fa && fb) return FloatImm(rtype, fa->value + fb->value); + if (fa && fb) { + if (rtype.bits() == 32) { + return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast(fa->value) + + static_cast(fb->value))); + } else if (rtype.bits() == 64) { + return FloatImm(rtype, fa->value + fb->value); + } else { + return PrimExpr(); + } + } if 
(fa && fa->value == 0) return b; if (fb && fb->value == 0) return a; }); @@ -113,9 +159,21 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { << "Checked failed. Minuend 's value is 0U and it's dtype is uint " << "while Subtrahend's dtype is uint; which will cause a negative uint"; const DataType& rtype = a.dtype(); - if (pa && pb) return IntImm(rtype, pa->value - pb->value); + if (pa && pb) { + int64_t res = pa->value - pb->value; + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); + } if (pb && pb->value == 0) return a; - if (fa && fb) return FloatImm(rtype, fa->value - fb->value); + if (fa && fb) { + if (rtype.bits() == 32) { + return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast(fa->value) - + static_cast(fb->value))); + } else if (rtype.bits() == 64) { + return FloatImm(rtype, fa->value - fb->value); + } else { + return PrimExpr(); + } + } if (fb && fb->value == 0) return a; }); return PrimExpr(); @@ -125,7 +183,10 @@ template <> inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); - if (pa && pb) return IntImm(rtype, pa->value * pb->value); + if (pa && pb) { + int64_t res = pa->value * pb->value; + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); + } if (pa) { if (pa->value == 1) return b; if (pa->value == 0) return a; @@ -134,7 +195,16 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { if (pb->value == 1) return a; if (pb->value == 0) return b; } - if (fa && fb) return FloatImm(rtype, fa->value * fb->value); + if (fa && fb) { + if (rtype.bits() == 32) { + return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast(fa->value) * + static_cast(fb->value))); + } else if (rtype.bits() == 64) { + return FloatImm(rtype, fa->value * fb->value); + } else { + return PrimExpr(); + } + } if (fa) { if (fa->value == 1) return b; if (fa->value == 0) return a; @@ -155,7 +225,8 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { // due to division and mod can have 
different modes // NOTE: this will assumes truc div. ICHECK_NE(pb->value, 0) << "Divide by zero"; - return IntImm(rtype, pa->value / pb->value); + int64_t res = pa->value / pb->value; + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); } if (pa) { if (pa->value == 0) return a; @@ -165,7 +236,14 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(pb->value, 0) << "Divide by zero"; } if (fa && fb && fb->value != 0) { - return FloatImm(rtype, fa->value / fb->value); + if (rtype.bits() == 32) { + return FloatImm(rtype, GetFoldResultDoubleRepr(static_cast(fa->value) / + static_cast(fb->value))); + } else if (rtype.bits() == 64) { + return FloatImm(rtype, fa->value / fb->value); + } else { + return PrimExpr(); + } } if (fa && fa->value == 0) return a; if (fb) { @@ -182,7 +260,8 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { const DataType& rtype = a.dtype(); if (pa && pb) { ICHECK_NE(pb->value, 0) << "Divide by zero"; - return IntImm(rtype, pa->value % pb->value); + int64_t res = pa->value % pb->value; + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); } if (pa) { if (pa->value == 0) return a; @@ -201,7 +280,8 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { const DataType& rtype = a.dtype(); if (pa && pb) { ICHECK_NE(pb->value, 0) << "Divide by zero"; - return IntImm(rtype, arith::floordiv(pa->value, pb->value)); + int64_t res = arith::floordiv(pa->value, pb->value); + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); } if (pa) { if (pa->value == 0) return a; @@ -211,7 +291,14 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(pb->value, 0) << "Divide by zero"; } if (fa && fb && fb->value != 0) { - return FloatImm(rtype, std::floor(fa->value / fb->value)); + if (rtype.bits() == 32) { + return FloatImm(rtype, GetFoldResultDoubleRepr(std::floor(static_cast(fa->value) / + static_cast(fb->value)))); + } else if (rtype.bits() == 64) { + return FloatImm(rtype, std::floor(fa->value / fb->value)); + } 
else { + return PrimExpr(); + } } if (fa && fa->value == 0) return a; if (fb) { @@ -228,7 +315,8 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { const DataType& rtype = a.dtype(); if (pa && pb) { ICHECK_NE(pb->value, 0) << "Divide by zero"; - return IntImm(rtype, floormod(pa->value, pb->value)); + int64_t res = arith::floormod(pa->value, pb->value); + return IntImm(rtype, GetFoldResultInt64Repr(res, rtype)); } if (pa) { if (pa->value == 0) return a; diff --git a/src/ir/expr.cc b/src/ir/expr.cc index d3e23800d6c7..c926cc56e89a 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -33,6 +33,8 @@ #include #include +#include "../support/scalars.h" + namespace tvm { PrimExpr::PrimExpr(int32_t value) : PrimExpr(IntImm(DataType::Int(32), value)) {} @@ -76,7 +78,20 @@ IntImm::IntImm(DataType dtype, int64_t value, Span span) { ICHECK(dtype.is_int() || dtype.is_uint()) << "ValueError: IntImm supports only int or uint type, but " << dtype << " was supplied."; if (dtype.is_uint()) { - ICHECK_GE(value, 0U); + ICHECK_GE(value, 0U) << "ValueError: Literal value " << value + << " is negative for unsigned integer type " << dtype; + if (dtype.bits() < 64) { + ICHECK_LT(value, 1LL << dtype.bits()) + << "ValueError: Literal value " << value << " exceeds maximum of " << dtype; + } + } else if (dtype.bits() == 1) { + // int(1) + ICHECK(value == 0 || value == 1) << "ValueError: " << value << " exceeds range of " << dtype; + } else if (dtype.bits() < 64) { + ICHECK_GE(value, -(1LL << (dtype.bits() - 1))) + << "ValueError: Literal value " << value << " exceeds minimum of " << dtype; + ICHECK_LT(value, 1LL << (dtype.bits() - 1)) + << "ValueError: Literal value " << value << " exceeds maximum of " << dtype; } ObjectPtr node = make_object(); node->dtype = dtype; @@ -103,6 +118,21 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) FloatImm::FloatImm(DataType dtype, double value, Span span) { ICHECK_EQ(dtype.lanes(), 1) << "ValueError: FloatImm can only take scalar."; + + // check range for 
float32 and float16 since they have specified range. + if (!std::isinf(value) && !std::isnan(value)) { + if (dtype.bits() == 32) { + ICHECK_GE(value, std::numeric_limits::lowest()) + << "ValueError: Literal value " << value << " exceeds minimum of " << dtype; + ICHECK_LE(value, std::numeric_limits::max()) + << "ValueError: Literal value " << value << " exceeds maximum of " << dtype; + } else if (dtype.is_float16()) { + ICHECK_GE(value, -support::kMaxFloat16) + << "ValueError: Literal value " << value << " exceeds minimum of " << dtype; + ICHECK_LE(value, support::kMaxFloat16) + << "ValueError: Literal value " << value << " exceeds maximum of " << dtype; + } + } ObjectPtr node = make_object(); node->dtype = dtype; node->value = value; diff --git a/src/support/scalars.cc b/src/support/scalars.cc index 9caa7ca58915..0ab16899bae9 100644 --- a/src/support/scalars.cc +++ b/src/support/scalars.cc @@ -174,10 +174,6 @@ IntImm ValueToIntImm(int64_t value, int width) { } } -// 2^15 * (1 + 1023/1024) -// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format -constexpr double kMaxFloat16 = 65504.0; - FloatImm ValueToFloatImm(double value, int width) { if (width == 16) { if (!std::isinf(value) && (value < -kMaxFloat16 || value > kMaxFloat16)) { diff --git a/src/support/scalars.h b/src/support/scalars.h index 60b8fc40a8de..2fdbb001d922 100644 --- a/src/support/scalars.h +++ b/src/support/scalars.h @@ -61,6 +61,10 @@ std::string FloatImmToString(const FloatImm& float_imm); IntImm ValueToIntImm(int64_t value, int width); FloatImm ValueToFloatImm(double value, int width); +// 2^15 * (1 + 1023/1024) +// See https://en.wikipedia.org/wiki/Half-precision_floating-point_format +constexpr double kMaxFloat16 = 65504.0; + } // namespace support } // namespace tvm diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 89de2f6a9520..a8eb7f406c37 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py 
@@ -512,7 +512,7 @@ def verify( # Test backwards slicing. verify((3, 4, 3), [-1, -1, -1], [-5, -5, -5], [-1, -1, -1], (3, 4, 3)) # Test slicing with overlarge indices. - verify((3, 4, 3), [0, 0, 0], [np.iinfo(np.int64).max] * 3, [1, 1, 1], (3, 4, 3)) + verify((3, 4, 3), [0, 0, 0], [np.iinfo(np.int32).max] * 3, [1, 1, 1], (3, 4, 3)) # Test slice mode. verify( (3, 4, 3), [1, 0, 0], [3, -1, 3], [1, 1, 1], (2, 4, 3), slice_mode="size", test_ref=False diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index cacce5603e5f..fe662a30766c 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -777,7 +777,7 @@ def test_fuse_dynamic_squeeze_slice_take(): squeeze = relay.op.squeeze(x, axis=[0]) strided_slice = relay.op.strided_slice( - squeeze, begin=[0, 0], end=[15130, 9223372036854775807], strides=[1, 1] + squeeze, begin=[0, 0], end=[15130, 2147483647], strides=[1, 1] ) take = relay.op.take(strided_slice, take_val, axis=0) diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 82e1372f991e..c880f90ddffe 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -951,6 +951,8 @@ def test_cast_simplify(): ck.verify(tvm.tir.Cast(dtype1, x == x), tvm.tir.const(1, dtype1)) for dtype2 in dtypes: for i in [0, 1, 2, 3]: + if i > 1 and (dtype1 == "bool" or dtype2 == "bool"): + continue ck.verify(tvm.tir.Cast(dtype1, tvm.tir.const(i, dtype2)), tvm.tir.const(i, dtype1)) diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index 994a85095728..96b947e20655 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -1,4 +1,5 @@ # Licensed to the Apache Software Foundation (ASF) under one + # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file @@ -194,13 +195,13 @@ def check_cuda(n, value, lanes): fun(a) np.testing.assert_equal(a.numpy(), np_a) - check_cuda(64, 0xAB, 4) + check_cuda(64, np.int8(0xAB), 4) check_cuda(64, 0, 4) check_cuda(64, -3, 4) - check_cuda(64, 0xAB, 3) + check_cuda(64, np.int8(0xAB), 3) check_cuda(64, 0, 3) check_cuda(64, -3, 3) - check_cuda(64, 0xAB, 2) + check_cuda(64, np.int8(0xAB), 2) check_cuda(64, 0, 2) check_cuda(64, -3, 2) diff --git a/tests/python/unittest/test_tir_imm_values.py b/tests/python/unittest/test_tir_imm_values.py new file mode 100644 index 000000000000..a2a19a09ad87 --- /dev/null +++ b/tests/python/unittest/test_tir_imm_values.py @@ -0,0 +1,577 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import math +import random +import numpy as np +import tvm +import tvm.testing +import pytest +from tvm import tir +from tvm.script import tir as T +import pytest + + +@pytest.mark.parametrize( + "dtype, literals", + [ + ["int8", [-128, 0, 127]], + ["uint8", [0, 255]], + ["int32", [-2147483648, 2147483647]], + ["uint32", [0, 4294967295]], + ["int64", [-9223372036854775808, 9223372036854775807]], + ["uint64", [0, 9223372036854775807]], + ], +) +def test_tir_make_intimm(dtype, literals): + for l in literals: + imm = tir.const(l, dtype) + assert imm.value == l, imm + + +@pytest.mark.parametrize( + "dtype, literals", + [ + ["int8", [-129, 128]], + ["uint8", [-1, 256]], + ["int32", [-2147483650, 2147483648]], + ["uint32", [-1, 4294967296]], + ["uint64", [-1, 18446744073709551616]], + ], +) +def test_tir_invalid_intimm(dtype, literals): + for l in literals: + with pytest.raises(tvm.TVMError): + tir.const(l, dtype) + + +@pytest.mark.parametrize( + "dtype, literals", + [ + [ + "uint64", + { + 9223372036854775807: 9223372036854775807, + 18446744073709551615: 18446744073709551615, + }, + ], + ], +) +def test_tir_large_py_int_literals(dtype, literals): + """ + For large uint value, use LargeUIntImm intrin, + """ + for l in literals: + x = tir.const(l, dtype) + if isinstance(x, (tir.IntImm, tir.FloatImm)): + assert x.value == literals[l] + else: + # LargeUIntImm(low32, hi32) + assert (int(x.args[1]) << 32) + int(x.args[0]) == literals[l] + + +def test_tir_intimm_overflow(): + assert int(tir.const(255, "uint8") + tir.const(1, "uint8")) == 0 + assert int(tir.const(2**31 - 1, "int32") + tir.const(1, "int32")) == -(2**31) + assert int(tir.const(2**32 - 1, "uint32") + tir.const(1, "uint32")) == 0 + assert int(tir.const(2**63 - 1, "int64") + tir.const(1, "int64")) == -(2**63) + assert int(tir.const(2**32, "uint64") * tir.const(2**32, "uint64")) == 0 + # customized int types + assert int(tir.const(7, "int4") + tir.const(1, "int4")) == -8 + assert int(tir.const(2**39 - 1, "int40") + 
tir.const(1, "int40")) == -(2**39) + + +def compare_float_value(value, expect, msg): + if math.isfinite(value): + assert np.abs(value - expect) < 1e-5, f"{value} vs {expect}, {msg}" + elif math.isnan(value): + assert math.isnan(expect), f"{value} vs {expect}, {msg}" + elif math.isinf(value): + assert math.isinf(expect), f"{value} vs {expect}, {msg}" + + +@pytest.mark.parametrize( + "dtype, literals", + [ + ["float16", [-65504.0, 3.14, 65504.0, np.inf, np.nan]], + ["bfloat16", [-3.38953139e38, 3.38953139e38, 3.14]], + ["float32", [np.finfo("float32").min, 3.14, np.finfo("float32").max, np.inf, np.nan]], + ["float64", [np.finfo("float64").min, 3.14, np.finfo("float64").max, np.inf, np.nan]], + ], +) +def test_tir_make_floatimm(dtype, literals): + for l in literals: + imm = tir.const(l, dtype) + compare_float_value(imm.value, l, "imm value should match feed value") + + +@pytest.mark.parametrize( + "dtype, literals", + [ + ["float16", [-65505.0, 65505.0]], + ["float32", [-3.402e39, 3.402e39]], + ], +) +def test_tir_invalid_floatimm(dtype, literals): + """Currently only fp16 and fp32 have range check.""" + for l in literals: + with pytest.raises(tvm.TVMError): + tir.const(l, dtype) + + +@pytest.mark.parametrize("dtype", ["float16", "float32", "float64"]) +@pytest.mark.parametrize("literal", [3.14, np.nan, np.inf]) +def test_tir_special_floatimms(dtype, literal): + x = tir.const(literal, dtype) + compare_float_value(x.value, literal, "imm value should match feed value") + + +@tvm.testing.requires_llvm() +def test_tir_too_large_literal_f64(): + # Behavior check: if literal f64 value is out of dtype range, the + # object is still constructed, and eval to infinity. 
+ @T.prim_func + def imm_overflow_fp64() -> T.float64: + T.evaluate(T.ret(T.float64(1.7976e309), dtype="float64")) + + f = tvm.build(imm_overflow_fp64, target="llvm") + assert math.isinf(f()) + + +@pytest.mark.parametrize( + "literal, expect_dtype", + [ + (256, "int32"), + (2147483647, "int32"), + (-2147483648, "int32"), + (2147483648, "int64"), + (-2147483649, "int64"), + (3.14159, "float32"), + (np.finfo("float32").min, "float32"), + (np.finfo("float32").max, "float32"), + (-3.402e39, "float64"), + (3.402e39, "float64"), + ], +) +def test_tir_const_auto_dtype(literal, expect_dtype): + x = tir.const(literal, dtype=None) + assert x.dtype == expect_dtype + assert x.value == literal + + +def check_tir_const_fold( + dtype, foldf, calcf, x_range=None, y_range=None, expect=None, skip_overflow=False +): + """Helper to check constant folding behavior + + Parameters + ---------- + dtype: str + Datatype of constants + + foldf: (x, y) -> z + Folding function to call + + calcf: (x, y) -> z + Compiled calculation function to call + + x_range: Union[int, float, tuple] + Single value or value range [min, max] + + y_range: Union[int, float, tuple] + Single value or value range [min, max] + + expect: Union[int, float] + Expected calculation result + + skip_overflow: bool + Skip assertion if the overflow happens + """ + seed = random.randint(0, 2147483648) + np.random.seed(seed) + ninfo = np.finfo(dtype) if dtype.startswith("float") else np.iinfo(dtype) + + if x_range is None: + x_range = (ninfo.min, ninfo.max) + if isinstance(x_range, (int, float)): + x = x_range + elif dtype.startswith("int") or dtype.startswith("uint"): + x = np.random.randint(x_range[0], x_range[1] + 1, dtype=dtype) + else: + x = np.random.uniform(x_range[0], x_range[1]) + + if y_range is None: + y_range = (ninfo.min, ninfo.max) + if isinstance(y_range, (int, float)): + y = y_range + elif dtype.startswith("int") or dtype.startswith("uint"): + y = np.random.randint(y_range[0], y_range[1] + 1, dtype=dtype) + 
else: + y = np.random.uniform(y_range[0], y_range[1]) + + if skip_overflow: + py_res = foldf(x, y) + if isinstance(py_res, (tir.IntImm, tir.FloatImm)): + py_res = py_res.value + if not (ninfo.min <= py_res <= ninfo.max): + # If the result overflows, certain arithmetic is undefined, + # thus we intentionally do not fail the test. + return + + fold_res = foldf(tir.const(x, dtype), tir.const(y, dtype)) + calc_res = calcf(x, y) + + flaky_msg = ( + f"{dtype} ({x}, {y}, {expect}) const folding check failed.\n" + + "This test is intentionally non-deterministic, " + + f"if it fails please report it in github issue together with this seed {seed}\n" + ) + if dtype.startswith("float"): + compare_float_value(calc_res, fold_res.value, flaky_msg) + if expect: + compare_float_value(expect, calc_res, flaky_msg) + else: + assert calc_res == fold_res.value, flaky_msg + if expect: + assert expect == calc_res, flaky_msg + + +@tvm.testing.requires_llvm() +def test_tir_floatimm_const_fold(): + """Behavior check: folding fp32 match platform f32 arithmetic""" + + @T.prim_func + def float_imm_multiply(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]): + z[()] = x * y + + @T.prim_func + def float_imm_add(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]): + z[()] = x + y + + @T.prim_func + def float_imm_sub(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]): + z[()] = x - y + + @T.prim_func + def float_imm_div(x: T.float32, y: T.float32, z: T.Buffer[(), "float32"]): + z[()] = x / y + + def __wrap_build(f): + lib = tvm.build(f, target="llvm") + z = tvm.nd.array(np.zeros([]).astype("float32")) + + def _func(x, y): + lib(x, y, z) + return z.numpy() + + return _func + + fmul = __wrap_build(float_imm_multiply) + fadd = __wrap_build(float_imm_add) + fsub = __wrap_build(float_imm_sub) + fdiv = __wrap_build(float_imm_div) + + # overflow + check_tir_const_fold("float32", lambda x, y: x * y, fmul, 3.0e30, 3.0e30, np.inf) + check_tir_const_fold("float32", lambda x, y: x * y,
fmul, 3.0e30, -3.0e30, -np.inf) + check_tir_const_fold("float32", lambda x, y: x / y, fdiv, 3.0e30, 3.0e-30, np.inf) + + # divide by zero + with pytest.raises(tvm.TVMError): + check_tir_const_fold("float32", lambda x, y: x / y, fdiv, 1.0, 0.0) + + # nan and inf + check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, np.nan, np.nan) + check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, np.inf, np.inf) + check_tir_const_fold("float32", lambda x, y: x + y, fadd, 1.0, -np.inf, -np.inf) + + # randomized check + check_tir_const_fold("float32", lambda x, y: x * y, fmul) + check_tir_const_fold("float32", lambda x, y: x + y, fadd) + check_tir_const_fold("float32", lambda x, y: x - y, fsub) + check_tir_const_fold( + "float32", lambda x, y: x / y, fdiv, y_range=(0.01, np.finfo("float32").max) + ) + + +@tvm.testing.requires_llvm() +def test_tir_int8_const_fold(): + """Behavior check: folding i8 operation match platform i8 arithmetic""" + + @T.prim_func + def imm_multiply(x: T.int8, y: T.int8) -> T.int8: + T.evaluate(T.ret(x * y, dtype="int8")) + + @T.prim_func + def imm_add(x: T.int8, y: T.int8) -> T.int8: + T.evaluate(T.ret(x + y, dtype="int8")) + + @T.prim_func + def imm_sub(x: T.int8, y: T.int8) -> T.int8: + T.evaluate(T.ret(x - y, dtype="int8")) + + @T.prim_func + def imm_truncdiv(x: T.int8, y: T.int8) -> T.int8: + T.evaluate(T.ret(T.truncdiv(x, y), dtype="int8")) + + @T.prim_func + def imm_floordiv(x: T.int8, y: T.int8) -> T.int8: + T.evaluate(T.ret(T.floordiv(x, y), dtype="int8")) + + fmul = tvm.build(imm_multiply, target="llvm") + fadd = tvm.build(imm_add, target="llvm") + fsub = tvm.build(imm_sub, target="llvm") + ffloordiv = tvm.build(imm_floordiv, target="llvm") + ftruncdiv = tvm.build(imm_truncdiv, target="llvm") + + # overflow + check_tir_const_fold("int8", lambda x, y: x + y, fadd, 127, 1, -128) + check_tir_const_fold("int8", lambda x, y: x * y, fmul, 127, 127, 1) + + # divide by zero + with pytest.raises(tvm.TVMError): + 
check_tir_const_fold("int8", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("int8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0) + + # i8 mod folding is not implemented + assert not isinstance(tir.floormod(tir.const(7, "int8"), tir.const(3, "int8")), tir.IntImm) + assert not isinstance(tir.truncmod(tir.const(7, "int8"), tir.const(3, "int8")), tir.IntImm) + + # randomized check + check_tir_const_fold("int8", lambda x, y: x * y, fmul) + check_tir_const_fold("int8", lambda x, y: x + y, fadd) + check_tir_const_fold("int8", lambda x, y: x - y, fsub) + check_tir_const_fold( + "int8", lambda x, y: tir.floordiv(x, y), ffloordiv, y_range=(1, np.iinfo("int8").max) + ) + check_tir_const_fold( + "int8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, y_range=(1, np.iinfo("int8").max) + ) + + +@tvm.testing.requires_llvm() +def test_tir_uint8_const_fold(): + """Behavior check: folding u8 operation match platform u8 arithmetic""" + + @T.prim_func + def imm_multiply(x: T.uint8, y: T.uint8) -> T.uint8: + T.evaluate(T.ret(x * y, dtype="uint8")) + + @T.prim_func + def imm_add(x: T.uint8, y: T.uint8) -> T.uint8: + T.evaluate(T.ret(x + y, dtype="uint8")) + + @T.prim_func + def imm_sub(x: T.uint8, y: T.uint8) -> T.uint8: + T.evaluate(T.ret(x - y, dtype="uint8")) + + @T.prim_func + def imm_truncdiv(x: T.uint8, y: T.uint8) -> T.uint8: + T.evaluate(T.ret(T.truncdiv(x, y), dtype="uint8")) + + @T.prim_func + def imm_floordiv(x: T.uint8, y: T.uint8) -> T.uint8: + T.evaluate(T.ret(T.floordiv(x, y), dtype="uint8")) + + fmul = tvm.build(imm_multiply, target="llvm") + fadd = tvm.build(imm_add, target="llvm") + fsub = tvm.build(imm_sub, target="llvm") + ffloordiv = tvm.build(imm_floordiv, target="llvm") + ftruncdiv = tvm.build(imm_truncdiv, target="llvm") + + # overflow + check_tir_const_fold("uint8", lambda x, y: x + y, fadd, 255, 1, 0) + + # zero sub + with pytest.raises(tvm.TVMError): + check_tir_const_fold("uint8", lambda x, y: x - 
y, fsub, 0, 10) + + # divide by zero + with pytest.raises(tvm.TVMError): + check_tir_const_fold("uint8", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("uint8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0) + + # u8 mod folding is not implemented + assert not isinstance(tir.floormod(tir.const(7, "uint8"), tir.const(3, "uint8")), tir.IntImm) + assert not isinstance(tir.truncmod(tir.const(7, "uint8"), tir.const(3, "uint8")), tir.IntImm) + + # randomized check + check_tir_const_fold("uint8", lambda x, y: x * y, fmul) + check_tir_const_fold("uint8", lambda x, y: x + y, fadd) + check_tir_const_fold("uint8", lambda x, y: x - y, fsub) + check_tir_const_fold( + "uint8", lambda x, y: tir.floordiv(x, y), ffloordiv, y_range=(1, np.iinfo("uint8").max) + ) + check_tir_const_fold( + "uint8", lambda x, y: tir.truncdiv(x, y), ftruncdiv, y_range=(1, np.iinfo("uint8").max) + ) + + +@tvm.testing.requires_llvm() +def test_tir_int32_const_fold(): + """Behavior check: folding i32 operation match platform i32 arithmetic""" + + @T.prim_func + def imm_multiply(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(x * y, dtype="int32")) + + @T.prim_func + def imm_add(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(x + y, dtype="int32")) + + @T.prim_func + def imm_sub(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(x - y, dtype="int32")) + + @T.prim_func + def imm_truncdiv(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(T.truncdiv(x, y), dtype="int32")) + + @T.prim_func + def imm_truncmod(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(T.truncmod(x, y), dtype="int32")) + + @T.prim_func + def imm_floordiv(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(T.floordiv(x, y), dtype="int32")) + + @T.prim_func + def imm_floormod(x: T.int32, y: T.int32) -> T.int32: + T.evaluate(T.ret(T.floormod(x, y), dtype="int32")) + + fmul = tvm.build(imm_multiply, target="llvm") + fadd = tvm.build(imm_add, 
target="llvm") + fsub = tvm.build(imm_sub, target="llvm") + ffloordiv = tvm.build(imm_floordiv, target="llvm") + ffloormod = tvm.build(imm_floormod, target="llvm") + ftruncdiv = tvm.build(imm_truncdiv, target="llvm") + ftruncmod = tvm.build(imm_truncmod, target="llvm") + + # i32 overflow is not specified, only check for range + assert -(2**31) <= int(tir.const(2**31 - 1, "int32") + tir.const(1, "int32")) < 2**31 + assert -(2**31) <= int(tir.const(-(2**31), "int32") - tir.const(1, "int32")) < 2**31 + + # divide by zero + with pytest.raises(tvm.TVMError): + check_tir_const_fold("int32", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("int32", lambda x, y: tir.floormod(x, y), ffloormod, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("int32", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("int32", lambda x, y: tir.truncmod(x, y), ftruncmod, 1, 0) + + # randomized check + check_tir_const_fold("int32", lambda x, y: x * y, fmul, skip_overflow=True) + check_tir_const_fold("int32", lambda x, y: x + y, fadd, skip_overflow=True) + check_tir_const_fold("int32", lambda x, y: x - y, fsub, skip_overflow=True) + check_tir_const_fold( + "int32", + lambda x, y: tir.floordiv(x, y), + ffloordiv, + y_range=(1, np.iinfo("int32").max), + skip_overflow=True, + ) + check_tir_const_fold( + "int32", + lambda x, y: tir.truncdiv(x, y), + ftruncdiv, + y_range=(1, np.iinfo("int32").max), + skip_overflow=True, + ) + check_tir_const_fold( + "int32", + lambda x, y: tir.floormod(x, y), + ffloormod, + y_range=(1, np.iinfo("int32").max), + skip_overflow=False, + ) + check_tir_const_fold( + "int32", + lambda x, y: tir.truncmod(x, y), + ftruncmod, + y_range=(1, np.iinfo("int32").max), + skip_overflow=False, + ) + + +@tvm.testing.requires_llvm() +def test_tir_uint32_const_fold(): + """Behavior check: folding u32 operation match platform u32 arithmetic""" + + 
@T.prim_func + def imm_multiply(x: T.uint32, y: T.uint32) -> T.uint32: + T.evaluate(T.ret(x * y, dtype="uint32")) + + @T.prim_func + def imm_add(x: T.uint32, y: T.uint32) -> T.uint32: + T.evaluate(T.ret(x + y, dtype="uint32")) + + @T.prim_func + def imm_sub(x: T.uint32, y: T.uint32) -> T.uint32: + T.evaluate(T.ret(x - y, dtype="uint32")) + + @T.prim_func + def imm_truncdiv(x: T.uint32, y: T.uint32) -> T.uint32: + T.evaluate(T.ret(T.truncdiv(x, y), dtype="uint32")) + + @T.prim_func + def imm_floordiv(x: T.uint32, y: T.uint32) -> T.uint32: + T.evaluate(T.ret(T.floordiv(x, y), dtype="uint32")) + + fmul = tvm.build(imm_multiply, target="llvm") + fadd = tvm.build(imm_add, target="llvm") + fsub = tvm.build(imm_sub, target="llvm") + ffloordiv = tvm.build(imm_floordiv, target="llvm") + ftruncdiv = tvm.build(imm_truncdiv, target="llvm") + + # u32 overflow is not specified, only check for range + assert 0 <= int(tir.const(2**32 - 1, "uint32") + tir.const(1, "uint32")) < 2**32 + + # divide by zero + with pytest.raises(tvm.TVMError): + check_tir_const_fold("uint32", lambda x, y: tir.floordiv(x, y), ffloordiv, 1, 0) + with pytest.raises(tvm.TVMError): + check_tir_const_fold("uint32", lambda x, y: tir.truncdiv(x, y), ftruncdiv, 1, 0) + + # u32 mod folding is not implemented + assert not isinstance(tir.floormod(tir.const(7, "uint32"), tir.const(3, "uint32")), tir.IntImm) + assert not isinstance(tir.truncmod(tir.const(7, "uint32"), tir.const(3, "uint32")), tir.IntImm) + + # randomized check + check_tir_const_fold("uint32", lambda x, y: x * y, fmul, skip_overflow=True) + check_tir_const_fold("uint32", lambda x, y: x + y, fadd, skip_overflow=True) + check_tir_const_fold("uint32", lambda x, y: x - y, fsub, skip_overflow=True) + check_tir_const_fold( + "uint32", + lambda x, y: tir.floordiv(x, y), + ffloordiv, + y_range=(1, np.iinfo("uint32").max), + skip_overflow=False, + ) + check_tir_const_fold( + "uint32", + lambda x, y: tir.truncdiv(x, y), + ftruncdiv, + y_range=(1,
np.iinfo("uint32").max), + skip_overflow=False, + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_narrow_datatype.py b/tests/python/unittest/test_tir_transform_narrow_datatype.py index d66b4ef5dd5b..20818a5b326a 100644 --- a/tests/python/unittest/test_tir_transform_narrow_datatype.py +++ b/tests/python/unittest/test_tir_transform_narrow_datatype.py @@ -67,8 +67,6 @@ def check(m, n, target_bits, target_dtype): # const shape # i32 -> i32 check(2, 2, 32, "int32") - # i32 + i32 is not promoted to i64 even if overflow - check(2**16, 2**16, 32, "int32") # i64 -> i32 check(const(2, dtype="int64"), const(2, dtype="int64"), 32, "int32") check(const(2**16, dtype="int64"), const(2**16, dtype="int64"), 32, "int64") @@ -100,12 +98,6 @@ def check(m, n, target_bits, target_dtype): # i32 -> i32 check(2, 32, target_bits=32, target_dtype="int32") - check( - 2**30, - 32, # i32 + i32 is not promoted to i64 even in the case of overflow - target_bits=32, - target_dtype="int32", - ) # i64 -> i32 check(const(2, dtype="int64"), const(32, dtype="int64"), target_bits=32, target_dtype="int32") check( @@ -162,7 +154,6 @@ def check(m, lanes, target_bits, target_dtype): # i32 -> i32 check(const(2**10, dtype="int32"), 2, target_bits=32, target_dtype="int32") - check(const(2**32, dtype="int32"), 2, target_bits=32, target_dtype="int32") # i64 -> i32 check(const(2**10, dtype="int64"), 2, target_bits=32, target_dtype="int32") check(const(2**32, dtype="int64"), 2, target_bits=32, target_dtype="int64") From 4c05656c65e6ab73f398f3fa982f84d48b16b55d Mon Sep 17 00:00:00 2001 From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com> Date: Fri, 9 Sep 2022 16:19:45 -0500 Subject: [PATCH 144/704] [TOPI][Hexagon] Add test and schedule for uint8 resize2d (#12559) * [TOPI][Hexagon] Add test and schedule for uint8 resize2d * Fix correctness issue * Reformat * Remove cubic from testing * Remove unnecessary else --- 
python/tvm/topi/hexagon/resize2d.py | 41 +++++++++++++-- .../test_hexagon/topi/test_resize2d.py | 52 +++++++++++++++---- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/python/tvm/topi/hexagon/resize2d.py b/python/tvm/topi/hexagon/resize2d.py index ed544143b583..0e817e2e9330 100755 --- a/python/tvm/topi/hexagon/resize2d.py +++ b/python/tvm/topi/hexagon/resize2d.py @@ -58,24 +58,59 @@ def resize2d_compute( ) -def tir_broadcast_schedule( +def tir_resize2d_schedule( out_m, input_a, input_layout: str, output_layout: str, ): - """Schedule for input and output layout nhwc-8h2w32c2w-2d""" + """Schedule for input and output layout nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d""" func = te.create_prim_func([input_a, out_m]) s = tir.Schedule(func) block = s.get_block("resize") - if input_layout == "nhwc-8h2w32c2w-2d": + if input_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): input_transformed_layout = get_layout_transform_fn(input_layout) s.transform_layout(block, buffer=("read", 0), index_map=input_transformed_layout) output_transformed_layout = get_layout_transform_fn(output_layout) s.transform_layout(block, buffer=("write", 0), index_map=output_transformed_layout) + if output_layout == "nhwc-8h2w32c2w-2d": + # Fixed chunk size is 2048 byte + # For fp16 the layout for fixed chunk is 8x4x32 + # where each element is 2 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a factor of 32 + # Width is split by a factor of 4 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 4]) + co, ci = s.split(c, [None, 32]) + + s.reorder(n, ho, wo, co, hi, wi, ci) + + elif output_layout == "nhwc-8h8w32c-2d": + # Fixed chunk size is 2048 byte + # For uint8 the layout for fixed chunk is 8x8x32 + # where each element is 1 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a factor of 32 + # Width is split by a 
factor of 8 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 8]) + co, ci = s.split(c, [None, 32]) + + s.reorder(n, ho, wo, co, hi, wi, ci) + return s diff --git a/tests/python/contrib/test_hexagon/topi/test_resize2d.py b/tests/python/contrib/test_hexagon/topi/test_resize2d.py index d0c2c1464a95..1ef9f50977c5 100755 --- a/tests/python/contrib/test_hexagon/topi/test_resize2d.py +++ b/tests/python/contrib/test_hexagon/topi/test_resize2d.py @@ -26,26 +26,46 @@ @tvm.testing.fixture def expected_output_np( - input_np, in_height, in_width, out_height, out_width, layout, method, coord_trans + input_np, + in_height, + in_width, + out_height, + out_width, + layout, + method, + coord_trans, + dtype, ): scale_h = out_height / in_height scale_w = out_width / in_width + return resize2d_python(input_np, (scale_h, scale_w), layout, method, coord_trans) @tvm.testing.fixture def input_np(input_shape, dtype): - return np.random.random(input_shape).astype(dtype) + if dtype == "float16": + return np.random.random(input_shape).astype(dtype) + if dtype == "uint8": + return np.random.randint(0, 255, input_shape).astype(dtype) + if dtype == "int8": + return np.random.randint(-128, 127, input_shape).astype(dtype) @tvm.testing.fixture -def transformed_input_np(input_np, layout, input_crouton_layout): - return transform_numpy(input_np, layout.lower(), input_crouton_layout) +def transformed_input_np(input_np, layout, input_crouton_layout, dtype): + if dtype == "float16" or dtype == "uint8" or dtype == "int8": + return transform_numpy(input_np, layout.lower(), input_crouton_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture -def transformed_expected_output_np(expected_output_np, layout, output_layout): - return transform_numpy(expected_output_np, layout.lower(), output_layout) +def transformed_expected_output_np(expected_output_np, layout, output_layout, dtype): + if dtype 
== "float16" or dtype == "uint8" or dtype == "int8": + return transform_numpy(expected_output_np, layout.lower(), output_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") @tvm.testing.fixture @@ -80,10 +100,11 @@ class TestResize2d: (layout, input_crouton_layout, output_layout, dtype,) = tvm.testing.parameters( ("NHWC", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d", "float16"), + ("NHWC", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d", "uint8"), ) coord_trans = tvm.testing.parameter("asymmetric", "align_corners", "half_pixel") - method = tvm.testing.parameter("nearest_neighbor", "linear", "cubic") + method = tvm.testing.parameter("nearest_neighbor", "linear") @tvm.testing.requires_hexagon def test_resize2d( @@ -112,14 +133,18 @@ def test_resize2d( layout=layout, coordinate_transformation_mode=coord_trans, method=method, + out_dtype=dtype, ) - tir_schedule = s1.tir_broadcast_schedule(M, A, input_crouton_layout, output_layout) + tir_schedule = s1.tir_resize2d_schedule(M, A, input_crouton_layout, output_layout) sch = tir_schedule.mod input_axis_separator = [4] - if output_layout == "nhwc-8h2w32c2w-2d": + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): output_axis_separator = [4] else: raise RuntimeError(f"Unexpected layout '{output_layout}'") @@ -155,8 +180,15 @@ def test_resize2d( # convert nd to np and reshape to fixed chunk size layout if output_layout == "nhwc-8h2w32c2w-2d": M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) + elif output_layout == "nhwc-8h8w32c-2d": + M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3) + if dtype == "float16": + np.testing.assert_allclose( + transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3 + ) + elif dtype == "int8" or dtype == "uint8": + np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1) if __name__ == 
"__main__": From 2eed6636436901f8a862304603d9b40d83432261 Mon Sep 17 00:00:00 2001 From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com> Date: Fri, 9 Sep 2022 16:21:23 -0500 Subject: [PATCH 145/704] [TOPI][Hexagon] Implement quantized elementwise for hexagon (#12606) * [TOPI][Hexagon] Add test and schedule for uint8 resize2d * Fix correctness issue * Reformat * [TOPI][Hexagon] Implement quantized elementwise * Reformat * Address review comments * Reformat * Revert * Address review comments --- python/tvm/topi/hexagon/qnn/__init__.py | 2 +- python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py | 270 ++++++++++++++++++ .../topi/test_add_subtract_multiply.py | 217 ++++++++++++-- 3 files changed, 463 insertions(+), 26 deletions(-) create mode 100755 python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py index 25d1e6d1854d..ef9c025ba5b2 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -18,7 +18,7 @@ """ Computes and schedules for Hexagon quantized ops """ from .avg_pool2d import qnn_avg_pool2d_compute, qnn_avg_pool2d_schedule - +from .qadd_qsub_qmul import * from .dequantize import ( dequantize_compute, dequantize_schedule, diff --git a/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py new file mode 100755 index 000000000000..043ad313bdef --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/qadd_qsub_qmul.py @@ -0,0 +1,270 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name + +"""Compute and schedule for quantized add, multiply, subtract op + +Please note the following assumptions made by the implementation: + +1) The inputs will be multiple of crouton layout except for the axis that needs broadcasting.""" + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, get_fixed_point_value + + +def broadcast_axis(tensor_A, tensor_B): + """Find out the indices that will have broadcasting""" + A_broadcast = [] + B_broadcast = [] + + for i in range(len(tensor_A.shape)): + if tensor_A.shape[i] == tensor_B.shape[i]: + A_broadcast.append(1) + B_broadcast.append(1) + elif tensor_A.shape[i] == 1: + A_broadcast.append(0) + B_broadcast.append(1) + elif tensor_B.shape[i] == 1: + A_broadcast.append(1) + B_broadcast.append(0) + return A_broadcast, B_broadcast + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) + + +def get_int_scale( + scale_A: float, + scale_B: float, + scale_M: float, + zero_point_A: int, + zero_point_B: int, + zero_point_M: int, + op: str, +): + """ + Get fixed-point number and exp_scale_factor from topi.hexagon.utils.get_fixed_point_value. + Also, depending on the op, this function uses exp_scale_factor(log2 of the scale factor) + to adjust the output's zero_point. 
+ """ + + C_recip = 1 / scale_M + + if op == "qmul": + scale = scale_A * scale_B * C_recip + scale_fixed_point, rsh = get_fixed_point_value(scale, "int16") + + # We need to adjust output's zero point value since the compute for the op is multiplied + # by a scaling factor. + # The scaling factor is 2^x where x is the exp_scale_factor which is assigned to rsh here. + # Since zero_point_M is multiplied by 2^rsh while converting floating-point scale value + # into fixed-point number, we left shift it by rsh in our compute to reflect that. + + corr = zero_point_M << rsh + + return scale_fixed_point, rsh, corr + + a_scale_f = scale_A * C_recip + b_scale_f = scale_B * C_recip + scale_fixed_point_a, rsh_a = get_fixed_point_value(a_scale_f, "int16") + scale_fixed_point_b, rsh_b = get_fixed_point_value(b_scale_f, "int16") + + # Here we have two exp_scale_factors rsh_a and rsh_b. + # To avoid complexity, we want to use a common exp_scale_factor and + # we want to use the lowest of the two. + + # Since, either of scale_fixed_point_a or scale_fixed_point_b has already been multiplied + # by 2^max(rsh_a, rsh_b) in topi.hexagon.utils.get_fixed_point_value, + # we want to undo that by right shifting that scale_fixed_point value + # by the difference of rsh_a and rsh_b. + + # This results in having a common exp_scale_factor for both scale_fixed_point_a + # and scale_fixed_point_b. + + # We also set rsh here which is used to adjust the zero_point_M and compute the corr value, + # computation of which comes from the original equation of the op's compute. 
+ + if rsh_a > rsh_b: + scale_fixed_point_a = scale_fixed_point_a >> (rsh_a - rsh_b) + rsh = rsh_b + else: + scale_fixed_point_b = scale_fixed_point_b >> (rsh_b - rsh_a) + rsh = rsh_a + + if op == "qadd": + corr = (zero_point_M << rsh) - ( + zero_point_A * scale_fixed_point_a + zero_point_B * scale_fixed_point_b + ) + else: + corr = (zero_point_M << rsh) - ( + zero_point_A * scale_fixed_point_a - zero_point_B * scale_fixed_point_b + ) + + return scale_fixed_point_a, scale_fixed_point_b, rsh, corr + + +def qadd_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized add with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, w_b, c_b = B_broadcast + + scale_a, scale_b, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qadd" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * scale_a) + + (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def qsubtract_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized subtract with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, w_b, c_b = B_broadcast + + scale_a, scale_b, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qsub" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] * 
scale_a) + - (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] * scale_b) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def qmultiply_broadcast_compute( + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_shape: list, + zero_point_A: int, + scale_A: float, + zero_point_B: int, + scale_B: float, + zero_point_M: int, + scale_M: float, + dtype: str, +): + """Compute quantized multiply with broadcasting""" + A_broadcast, B_broadcast = broadcast_axis(tensor_A, tensor_B) + n_a, h_a, w_a, c_a = A_broadcast + n_b, h_b, w_b, c_b = B_broadcast + + scale_int, rsh, corr = get_int_scale( + scale_A, scale_B, scale_M, zero_point_A, zero_point_B, zero_point_M, "qmul" + ) + + return te.compute( + output_shape, + lambda n, h, w, c: saturate( + ( + ( + scale_int + * (tensor_A[n * n_a, h * h_a, w * w_a, c * c_a] - zero_point_A) + * (tensor_B[n * n_b, h * h_b, w * w_b, c * c_b] - zero_point_B) + + corr + ) + >> rsh + ), + dtype, + ).astype(dtype), + ) + + +def tir_schedule_quant( + out_M: te.Tensor, + tensor_A: te.Tensor, + tensor_B: te.Tensor, + output_layout: str, + tensor_A_layout: str, + tensor_B_layout: str, +): + """Schedule for output layout nhwc-8h8w32c-2d""" + func = te.create_prim_func([tensor_A, tensor_B, out_M]) + + s = tir.Schedule(func) + + block = s.get_block("compute") + + if tensor_A_layout == "nhwc-8h8w32c-2d": + tensor_A_transformed_layout = get_layout_transform_fn(tensor_A_layout) + s.transform_layout(block, buffer=tensor_A.name, index_map=tensor_A_transformed_layout) + + if tensor_B_layout == "nhwc-8h8w32c-2d": + tensor_B_transformed_layout = get_layout_transform_fn(tensor_B_layout) + s.transform_layout(block, buffer=tensor_B.name, index_map=tensor_B_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) + + n, h, w, c = s.get_loops(block) + + h_o, h_i = s.split(h, [None, 8]) + w_o, w_i = s.split(w, [None, 8]) + c_o, c_i = 
s.split(c, [None, 32]) + wio, wii = s.split(w_i, [None, 4]) + + s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) + + return s diff --git a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py index 606aa628d009..fe70745143a9 100755 --- a/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py +++ b/tests/python/contrib/test_hexagon/topi/test_add_subtract_multiply.py @@ -22,7 +22,8 @@ import tvm from tvm import te import tvm.topi.hexagon.slice_ops as sl -from ..infrastructure import allocate_hexagon_array, transform_numpy +import tvm.topi.hexagon.qnn as qn +from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np @tvm.testing.fixture @@ -38,34 +39,77 @@ def expected_output_np(input_np_A, input_np_B, op_name): @tvm.testing.fixture def input_np_A(input_shape_A, dtype): + if dtype == "uint8" or dtype == "int8": + dtype = "float32" return np.random.random(input_shape_A).astype(dtype) @tvm.testing.fixture def input_np_B(input_shape_B, dtype): + if dtype == "uint8" or dtype == "int8": + dtype = "float32" return np.random.random(input_shape_B).astype(dtype) @tvm.testing.fixture -def transformed_input_np_A(input_np_A, input_A_layout): - return transform_numpy(input_np_A, "nhwc", input_A_layout) +def quantize_input_np_A(input_np_A, dtype): + if dtype == "uint8" or dtype == "int8": + global zero_point_A_val, scale_A_val + input_np_A_quantized, scale_A_val, zero_point_A_val = quantize_np(input_np_A, dtype) + return input_np_A_quantized @tvm.testing.fixture -def transformed_input_np_B(input_np_B, input_B_layout): - return transform_numpy(input_np_B, "nhwc", input_B_layout) +def quantize_input_np_B(input_np_B, dtype): + if dtype == "uint8" or dtype == "int8": + global zero_point_B_val, scale_B_val + input_np_B_quantized, scale_B_val, zero_point_B_val = quantize_np(input_np_B, dtype) + return input_np_B_quantized @tvm.testing.fixture -def 
transformed_expected_output_np(expected_output_np, output_layout): - return transform_numpy(expected_output_np, "nhwc", output_layout) +def transformed_input_np_A(input_np_A, quantize_input_np_A, input_A_layout, dtype): + if dtype == "float16": + return transform_numpy(input_np_A, "nhwc", input_A_layout) + if dtype == "uint8" or dtype == "int8": + return transform_numpy(quantize_input_np_A, "nhwc", input_A_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + +@tvm.testing.fixture +def transformed_input_np_B(input_np_B, quantize_input_np_B, input_B_layout, dtype): + if dtype == "float16": + return transform_numpy(input_np_B, "nhwc", input_B_layout) + if dtype == "uint8" or dtype == "int8": + return transform_numpy(quantize_input_np_B, "nhwc", input_B_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, output_layout, dtype): + if dtype == "float16": + return transform_numpy(expected_output_np, "nhwc", output_layout) + if dtype == "uint8" or dtype == "int8": + global zero_point_M_val, scale_M_val + out_ref_quantized, scale_M_val, zero_point_M_val = quantize_np(expected_output_np, dtype) + return transform_numpy(out_ref_quantized, "nhwc", output_layout) + + raise RuntimeError(f"Unsupported data type '{dtype}'") def hexagon_wrapper_allocation( - device, layout, axis_separators, tensor_shape=None, data=None, transformed_data=None, dtype=None + device, + layout, + axis_separators, + tensor_shape=None, + data_original=None, + transformed_data=None, + dtype=None, ): """Input layout can either be nhwc-8h2w32c2w-2d or nhwc""" - if layout == "nhwc-8h2w32c2w-2d": + if layout == "nhwc-8h2w32c2w-2d" or layout == "nhwc-8h8w32c-2d": data_nd = allocate_hexagon_array( device, tensor_shape=tensor_shape, @@ -77,7 +121,7 @@ def hexagon_wrapper_allocation( elif layout == "nhwc": data_nd = allocate_hexagon_array( device, - data=data, + data=data_original, ) return data_nd @@ 
-136,6 +180,86 @@ class TestAddSubtractMultiplyBroadcast2d: "nhwc-8h2w32c2w-2d", "float16", ), + # broadcast all axes in one input + ( + [1, 48, 56, 32], + [1, 1, 1, 1], + "nhwc-8h2w32c2w-2d", + "nhwc", + "nhwc-8h2w32c2w-2d", + "float16", + ), + ( + [1, 48, 32, 64], + [1, 48, 32, 64], + "nhwc-8h8w32c-2d", + "nhwc-8h8w32c-2d", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 2 in one input + ( + [1, 48, 32, 64], + [1, 48, 1, 64], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 1 in one input + ( + [1, 48, 32, 64], + [1, 1, 32, 64], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast axis 3 in one input + ( + [1, 8, 8, 32], + [1, 8, 8, 1], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast both inputs + ( + [1, 56, 1, 128], + [1, 1, 64, 1], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast both inputs + ( + [1, 48, 1, 1], + [1, 1, 32, 32], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast both inputs + ( + [1, 48, 1, 32], + [1, 1, 32, 1], + "nhwc", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), + # broadcast all axes in one input + ( + [1, 48, 56, 32], + [1, 1, 1, 1], + "nhwc-8h8w32c-2d", + "nhwc", + "nhwc-8h8w32c-2d", + "uint8", + ), ) op_name = tvm.testing.parameter("add", "subtract", "multiply") @@ -148,6 +272,8 @@ def test_transform( input_shape_B, input_np_A, input_np_B, + quantize_input_np_A, + quantize_input_np_B, transformed_input_np_A, transformed_input_np_B, expected_output_np, @@ -158,23 +284,50 @@ def test_transform( input_B_layout, op_name, ): + output_shape = expected_output_np.shape target_hexagon = tvm.target.hexagon("v69") A = te.placeholder(input_shape_A, name="A", dtype=dtype) B = te.placeholder(input_shape_B, name="B", dtype=dtype) - if op_name == "add": - M = sl.add_broadcast_compute(A, B) - elif op_name == "subtract": - M = sl.subtract_broadcast_compute(A, B) - elif op_name == "multiply": - M = 
sl.multiply_broadcast_compute(A, B) - - tir_schedule = sl.tir_broadcast_schedule( - M, A, B, output_layout, input_A_layout, input_B_layout, op_name - ) + if dtype == "float16": + if op_name == "add": + M = sl.add_broadcast_compute(A, B) + elif op_name == "subtract": + M = sl.subtract_broadcast_compute(A, B) + elif op_name == "multiply": + M = sl.multiply_broadcast_compute(A, B) + tir_schedule = sl.tir_broadcast_schedule( + M, A, B, output_layout, input_A_layout, input_B_layout, op_name + ) + elif dtype == "uint8" or dtype == "int8": + args = [ + A, + B, + output_shape, + zero_point_A_val, + scale_A_val, + zero_point_B_val, + scale_B_val, + zero_point_M_val, + scale_M_val, + dtype, + ] + if op_name == "add": + M = qn.qadd_broadcast_compute(*args) + elif op_name == "subtract": + M = qn.qsubtract_broadcast_compute(*args) + elif op_name == "multiply": + M = qn.qmultiply_broadcast_compute(*args) + tir_schedule = qn.tir_schedule_quant( + M, A, B, output_layout, input_A_layout, input_B_layout + ) + sch = tir_schedule.mod input_axis_separator = [4] - if output_layout == "nhwc-8h2w32c2w-2d": + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): output_axis_separator = [4] else: raise RuntimeError(f"Unexpected layout '{output_layout}'") @@ -187,19 +340,26 @@ def test_transform( name="slice_op_with_transform", ) - output_shape = expected_output_np.shape + if dtype == "float16": + in_data_np_A = input_np_A + in_data_np_B = input_np_B + elif dtype == "int8" or dtype == "uint8": + in_data_np_A = quantize_input_np_A + in_data_np_B = quantize_input_np_B + else: + raise RuntimeError(f"Unsupport dtype '{dtype}'") A_data_nd = hexagon_wrapper_allocation( hexagon_session.device, layout=input_A_layout, - data=input_np_A, + data_original=in_data_np_A, transformed_data=transformed_input_np_A, axis_separators=input_axis_separator, ) B_data_nd = hexagon_wrapper_allocation( hexagon_session.device, layout=input_B_layout, - data=input_np_B, + data_original=in_data_np_B, 
transformed_data=transformed_input_np_B, axis_separators=input_axis_separator, ) @@ -218,8 +378,15 @@ def test_transform( # convert nd to np and reshape to fixed chunk size layout if output_layout == "nhwc-8h2w32c2w-2d": M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) + elif output_layout == "nhwc-8h8w32c-2d": + M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) - np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3) + if dtype == "float16": + np.testing.assert_allclose( + transformed_expected_output_np, M_data_np, rtol=1e-3, atol=1e-3 + ) + elif dtype == "int8" or dtype == "uint8": + np.testing.assert_allclose(transformed_expected_output_np, M_data_np, rtol=1, atol=1) if __name__ == "__main__": From 76f91b42b96b7f3274509ed713a118c117ed2f65 Mon Sep 17 00:00:00 2001 From: Luke Hutton Date: Sat, 10 Sep 2022 10:58:45 +0100 Subject: [PATCH 146/704] [ETHOSN] Update driver stack version to 22.08 (#12650) Updates the driver stack used by the NPU to the latest released version (semantic version 3.1.0), while maintaining backwards compatibility for the previous version 22.05 (semantic 3.0.1) during the migration period. In addition, support for split is re-introduced as this is now supported in 22.08. 
Change-Id: I86bce3469f0b8ad52e66461ae055dec6717b3527 --- .../ubuntu_install_ethosn_driver_stack.sh | 2 +- python/tvm/relay/op/contrib/ethosn.py | 8 ++--- .../contrib/test_ethosn/test_networks.py | 33 +++++++++++++++---- .../python/contrib/test_ethosn/test_resize.py | 9 ----- .../python/contrib/test_ethosn/test_split.py | 15 +++++++-- .../contrib/test_ethosn/test_topologies.py | 18 ++++++---- 6 files changed, 57 insertions(+), 28 deletions(-) diff --git a/docker/install/ubuntu_install_ethosn_driver_stack.sh b/docker/install/ubuntu_install_ethosn_driver_stack.sh index 1f8373a839e9..1696b3230e2f 100755 --- a/docker/install/ubuntu_install_ethosn_driver_stack.sh +++ b/docker/install/ubuntu_install_ethosn_driver_stack.sh @@ -22,7 +22,7 @@ set -o pipefail repo_url="https://github.com/Arm-software/ethos-n-driver-stack" repo_dir="ethosn-driver" -repo_revision="22.05" +repo_revision="22.08" install_path="/opt/arm/$repo_dir" tmpdir=$(mktemp -d) diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 5129ed9ffaef..c8003c8da4d5 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -102,11 +102,11 @@ def partition_for_ethosn(mod, params=None, **opts): raise ValueError("When targeting Ethos(TM)-N78, -variant=n78 should be set.") api_version = ethosn_api_version() - expected_api_version = "3.0.1" - if api_version != LooseVersion(expected_api_version): + supported_api_versions = ["3.0.1", "3.1.0"] + if all(api_version != LooseVersion(exp_ver) for exp_ver in supported_api_versions): raise ValueError( f"Driver stack version {api_version} is unsupported. " - f"Please use version {expected_api_version}." + f"Please use version in {supported_api_versions}." 
) if params: @@ -415,7 +415,7 @@ def split(expr): """Check if a split is supported by Ethos-N.""" if not ethosn_available(): return False - if ethosn_api_version() >= LooseVersion("3.0.1"): + if ethosn_api_version() == LooseVersion("3.0.1"): return False if not _ethosn.split(expr): return False diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index b584a579b8be..75f3479a5a9c 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -14,7 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=wrong-import-position +# pylint: disable=wrong-import-position, wrong-import-order + """Arm(R) Ethos(TM)-N integration end-to-end network tests""" import pytest @@ -22,11 +23,16 @@ pytest.importorskip("tflite") pytest.importorskip("tensorflow") +from distutils.version import LooseVersion + import tflite.Model + from tvm import relay from tvm.testing import requires_ethosn from tvm.contrib import download +from tvm.relay.op.contrib.ethosn import ethosn_api_version import tvm.relay.testing.tf as tf_testing + from . import infrastructure as tei @@ -119,7 +125,10 @@ def test_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. 
- _compile_hash = {"50186822915909303e813205db80e032"} + if ethosn_api_version() == LooseVersion("3.1.0"): + _compile_hash = {"c37fec1f214c7f93ce49ee4e3b587969"} + else: + _compile_hash = {"50186822915909303e813205db80e032"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", @@ -141,7 +150,10 @@ def test_resnet_50_int8(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"} + if ethosn_api_version() == LooseVersion("3.1.0"): + _compile_hash = {"12d65aec33594c88b6d0d31dcd5144e6", "6a64d69ccb36dfb6b30dd2abdba4b005"} + else: + _compile_hash = {"9245965b2c01e7f3d9b478e38a186eb4", "4225fa951c145bb1e48e28cad6a3bdd4"} _test_image_network( model_url="https://raw.githubusercontent.com/dmlc/web-data/main/tensorflow/" "models/Quantized/resnet_50_quantized.tflite", @@ -162,7 +174,10 @@ def test_inception_v3(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"} + if ethosn_api_version() == LooseVersion("3.1.0"): + _compile_hash = {"cff892eb15944756f22dad4b83c756d2"} + else: + _compile_hash = {"a5a2b5d2b618de754bf9a01033a020c0"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/tflite_11_05_08/inception_v3_quant.tgz", @@ -183,7 +198,10 @@ def test_inception_v4(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. 
- _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"} + if ethosn_api_version() == LooseVersion("3.1.0"): + _compile_hash = {"2eeae331898f8e94c74868e190077837"} + else: + _compile_hash = {"61b4ade41898d7cb2451dbdc3340aced"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/inception_v4_299_quant_20181026.tgz", @@ -204,7 +222,10 @@ def test_ssd_mobilenet_v1(): # codegen, which could come about from either a change in Support Library # version or a change in the Ethos-N codegen. To update this requires running # on hardware that isn't available in CI. - _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"} + if ethosn_api_version() == LooseVersion("3.1.0"): + _compile_hash = {"ec2b78852192058f88b64d45c26620d5", "f68cbeaaba03874ea735ce3f5eab9227"} + else: + _compile_hash = {"789906c7d8ac787809b303d82781fc9d", "6b699f94795785d31b39940a5cf84a81"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip", diff --git a/tests/python/contrib/test_ethosn/test_resize.py b/tests/python/contrib/test_ethosn/test_resize.py index b437ad1e545c..30b29fb1612e 100644 --- a/tests/python/contrib/test_ethosn/test_resize.py +++ b/tests/python/contrib/test_ethosn/test_resize.py @@ -108,19 +108,10 @@ def test_resize(dtype, shape, size, coordinate_transformation_mode, rounding_met (20, 30), "Requested width isn't supported", ), - ( - (19, 20), - "Requested width and height must be both even or both odd", - ), - ( - (20, 19), - "Requested width and height must be both even or both odd", - ), ], ) def test_resize_failure(size, err_msg): """Check Resize error messages.""" - dtype = "int8" zp_min = np.iinfo(dtype).min diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py index afbc45a0805d..a6155065a54c 100644 --- a/tests/python/contrib/test_ethosn/test_split.py +++ 
b/tests/python/contrib/test_ethosn/test_split.py @@ -17,12 +17,15 @@ """Split tests for Arm(R) Ethos(TM)-N""" +from distutils.version import LooseVersion + import numpy as np import pytest import tvm from tvm import relay from tvm.testing import requires_ethosn +from tvm.relay.op.contrib.ethosn import ethosn_api_version from . import infrastructure as tei @@ -33,7 +36,6 @@ def _get_model(shape, dtype, splits, axis): return split.astuple() -@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) @pytest.mark.parametrize( @@ -45,6 +47,11 @@ def _get_model(shape, dtype, splits, axis): ) def test_split(dtype, shape, splits, axis): """Compare Split output with TVM.""" + if ethosn_api_version() == LooseVersion("3.0.1"): + pytest.skip( + "Split is not supported by the 3.0.1 version of the driver stack.", + ) + np.random.seed(0) outputs = [] @@ -62,7 +69,6 @@ def test_split(dtype, shape, splits, axis): tei.verify(outputs, dtype, 0) -@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn @pytest.mark.parametrize( "shape,dtype,splits,axis,err_msg", @@ -83,6 +89,11 @@ def test_split(dtype, shape, splits, axis): ) def test_split_failure(shape, dtype, splits, axis, err_msg): """Check Split error messages.""" + if ethosn_api_version() == LooseVersion("3.0.1"): + pytest.skip( + "Split is not supported by the 3.0.1 version of the driver stack.", + ) + model = _get_model(shape, dtype, splits, axis) mod = tei.make_ethosn_partition(model) tei.test_error(mod, {}, err_msg) diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py index dc6a2ed086d4..47a01154d0b2 100644 --- a/tests/python/contrib/test_ethosn/test_topologies.py +++ b/tests/python/contrib/test_ethosn/test_topologies.py @@ -17,13 +17,15 @@ """Arm(R) Ethos(TM)-N tests for complex network topologies.""" +from 
distutils.version import LooseVersion + import numpy as np import pytest import tvm from tvm import relay from tvm.testing import requires_ethosn -from tvm.relay.op.contrib.ethosn import Available, ethosn_available +from tvm.relay.op.contrib.ethosn import Available, ethosn_available, ethosn_api_version from . import infrastructure as tei @@ -78,8 +80,8 @@ def get_model(input_shape, dtype, var_names): model = get_model(inputs["a"].shape, dtype, iter(inputs)) mod = tei.make_module(model, []) - expected_host_ops = 1 - npu_partitions = 2 + expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0 + npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1 # Mock inference is only supported when the whole graph is offloaded to the NPU if ethosn_available() == Available.SW_ONLY: @@ -280,8 +282,8 @@ def get_model(shape, dtype, splits, axis): model = get_model(shape, dtype, splits, axis) mod = tei.make_module(model, {}) - expected_host_ops = 1 - npu_partitions = 2 + expected_host_ops = 1 if ethosn_api_version() == LooseVersion("3.0.1") else 0 + npu_partitions = 2 if ethosn_api_version() == LooseVersion("3.0.1") else 1 # Mock inference is only supported when the whole graph is offloaded to the NPU if ethosn_available() == Available.SW_ONLY: @@ -309,13 +311,17 @@ def get_model(shape, dtype, splits, axis): tei.verify(outputs, dtype, 0) -@pytest.mark.skip("Split is not supported by the 3.0.1 version of the driver stack.") @requires_ethosn @pytest.mark.parametrize("dtype", ["uint8", "int8"]) def test_output_tuple_propagation(dtype): """This tests the case where the output tuple must be inferred as having dummy tensor information.""" + if ethosn_api_version() == LooseVersion("3.0.1"): + pytest.skip( + "Split is not supported by the 3.0.1 version of the driver stack.", + ) + def get_model(dtype): a = relay.var("a", shape=(1, 4, 4, 16), dtype=dtype) split = relay.op.split(a, indices_or_sections=4, axis=2) From 
286fadecb8d536940b89669e699d757399dad755 Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Mon, 12 Sep 2022 00:01:10 -0700 Subject: [PATCH 147/704] [TF] Add Bincount support (#12751) --- python/tvm/relay/frontend/tensorflow_ops.py | 41 ++++++++++++- .../frontend/tensorflow/test_forward.py | 35 +++++++++++ .../tensorflow2/test_functional_models.py | 60 +++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tensorflow_ops.py b/python/tvm/relay/frontend/tensorflow_ops.py index 4598f4f09a05..66bb858edbf0 100644 --- a/python/tvm/relay/frontend/tensorflow_ops.py +++ b/python/tvm/relay/frontend/tensorflow_ops.py @@ -2868,11 +2868,49 @@ def _impl(inputs, attr, params, mod): return _impl +def _bincount(): + def _impl(inputs, attr, params, mod): + input = inputs[0] # arr: int32 Tensor + size = inputs[1] # size: non-negative int scalar Tensor + # weights: int32, int64, float32, or float64 Tensor with the same shape as arr + # or a length-0 Tensor, in which case it acts as all weights equal to 1. + weights = inputs[2] + # Returns: Output: 1D Tensor with length equal to size + # The counts or summed weights for each value in the range [0, size). + + input_shape = _infer_shape(input, mod) + if len(input_shape) > 1: + input = _op.reshape(input, [-1]) + + is_weights_zero_tensor = True + if weights: + weights_shape = _infer_shape(weights, mod) + is_weights_zero_tensor = weights_shape == (0,) + if len(weights_shape) > 1: + weights = _op.reshape(weights, [-1]) + + # Output should have the same dtype as weights. 
+ if is_weights_zero_tensor: + # if weights are length-0 Tensor - output dtype is float32 + out_dtype = "float32" + updates = _op.cast(_op.ones_like(input), out_dtype) + else: + out_dtype = _infer_type(weights, mod).checked_type.dtype + updates = weights + + counts_shape = _op.reshape(size, [1]) + counts = _op.zeros(counts_shape, out_dtype) + out = _op.scatter_add(counts, input, updates, axis=0) + return out + + return _impl + + def _dense_bincount(): def _impl(inputs, attr, params, mod): input = inputs[0] # input: int32, int64. 1D or 2D int Tensor size = inputs[1] # size: non-negative int scalar Tensor - # weights: int32, int64, float32, or float64 Tensor with the same shape as arr + # weights: int32, int64, float32, or float64 Tensor with the same shape as input # or a length-0 Tensor, in which case it acts as all weights equal to 1. weights = inputs[2] # Returns: Output: 1D Tensor with length equal to size @@ -2951,6 +2989,7 @@ def _impl(inputs, attr, params, mod): "BatchNormWithGlobalNormalization": _batch_norm(), "BatchToSpaceND": _batch_to_space_nd(), "BiasAdd": _bias_add(), + "Bincount": _bincount(), "BroadcastTo": _broadcast_to(), "BroadcastArgs": _broadcast_args(), "Cast": _cast(), diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index ebeb35e08f5d..8ed6d9108e5d 100755 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -5758,6 +5758,41 @@ def test_invert_permutation(): compare_tf_with_tvm(x, "Placeholder:0", out_name, no_gpu=False) +####################################################################### +# Bincount +# ---- + + +def _test_bincount(in_shape, size, weights): + with tf.Graph().as_default(): + inputs = [] + data = [] + inputs.append(tf.placeholder(shape=in_shape, dtype="int32", name="input0")) + data.append(np.random.uniform(0, size, size=in_shape).astype("int32")) + inputs.append(tf.placeholder(shape=(), dtype="int32", 
name="size")) + data.append(np.array(size, "int32")) + if weights: + inputs.append(tf.placeholder(shape=in_shape, dtype="float32", name="weights")) + data.append(np.reshape(weights, in_shape).astype("float32")) + else: + inputs.append(tf.placeholder(shape=(0,), dtype="float32", name="weights")) + data.append(np.array([], "float32")) + result = tf.raw_ops.Bincount(arr=data[0], size=data[1], weights=data[2]) + compare_tf_with_tvm(data, [a.name for a in inputs], result.name, mode="vm") + + +def test_forward_bincount(): + """Test Bincount Op""" + # 2D input + _test_bincount((3, 10), 20, [1.0] * 30) + _test_bincount((3, 10), 20, [1.5] * 30) + _test_bincount((3, 10), 20, None) + # 1D input + _test_bincount((10,), 20, [1.0] * 10) + _test_bincount((10,), 20, [1.5] * 10) + _test_bincount((10,), 20, None) + + ####################################################################### # DenseBincount # ---- diff --git a/tests/python/frontend/tensorflow2/test_functional_models.py b/tests/python/frontend/tensorflow2/test_functional_models.py index 001ba6de1967..42ad5b29af79 100644 --- a/tests/python/frontend/tensorflow2/test_functional_models.py +++ b/tests/python/frontend/tensorflow2/test_functional_models.py @@ -585,5 +585,65 @@ def func(self, x): run_test((-1, -1)) +def test_bincount_1d(): + def run_test(weights, minlength, maxlength, axis, binary_output): + class Bincount1D(tf.Module): + def get_input(self): + return np.random.uniform(low=0, high=maxlength, size=(100,)).astype("int32") + + @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.int32)]) + def func(self, x): + return tf.math.bincount( + x, + weights=weights, + minlength=minlength, + maxlength=maxlength, + axis=axis, + binary_output=binary_output, + ) + + run_model_graph(Bincount1D) + run_func_graph(Bincount1D, runtime="vm") + + for axis in [None, 0, -1]: + run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=False) + run_test(weights=None, minlength=20, maxlength=20, axis=axis, 
binary_output=True) + + # weights and axis=None need operator UnsortedSegmentSum to be implemented. Skip axis=None + weights = np.random.uniform(low=0.2, high=5, size=(100,)).astype("float32") + for axis in [0, -1]: + run_test(weights=weights, minlength=20, maxlength=20, axis=axis, binary_output=False) + + +def test_bincount_2d(): + def run_test(weights, minlength, maxlength, axis, binary_output): + class Bincount2D(tf.Module): + def get_input(self): + return np.random.uniform(low=0, high=maxlength, size=(3, 100)).astype("int32") + + @tf.function(input_signature=[tf.TensorSpec([None, None], tf.int32)]) + def func(self, x): + return tf.math.bincount( + x, + weights=weights, + minlength=minlength, + maxlength=maxlength, + axis=axis, + binary_output=binary_output, + ) + + run_model_graph(Bincount2D) + run_func_graph(Bincount2D, runtime="vm") + + for axis in [None, 0, -1]: + run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=False) + run_test(weights=None, minlength=20, maxlength=20, axis=axis, binary_output=True) + + # weights and axis=None need operator UnsortedSegmentSum to be implemented. Skip axis=None + weights = np.random.uniform(low=0.2, high=5, size=(3, 100)).astype("float32") + for axis in [0, -1]: + run_test(weights=weights, minlength=20, maxlength=20, axis=axis, binary_output=False) + + if __name__ == "__main__": pytest.main([__file__]) From 4c863fc115ee463284f20b5ee37c973ac0ed5d9a Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Mon, 12 Sep 2022 01:00:29 -0700 Subject: [PATCH 148/704] [TVMScript] Base IRBuilder methods for `Block` (#12748) This PR introduces base IRBuilder methods for `Block`. 
Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/frame.h | 70 +++++++++++++++++++ include/tvm/script/ir_builder/tir/ir.h | 8 +++ python/tvm/script/ir_builder/tir/frame.py | 5 ++ python/tvm/script/ir_builder/tir/ir.py | 20 ++++++ src/script/ir_builder/tir/frame.cc | 24 +++++++ src/script/ir_builder/tir/ir.cc | 17 +++++ src/script/ir_builder/tir/utils.h | 9 +++ .../unittest/test_tvmscript_ir_builder_tir.py | 27 +++++++ 8 files changed, 180 insertions(+) diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index 4bfd022af27a..15ab77863e5e 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -117,6 +117,76 @@ class PrimFuncFrame : public TIRFrame { TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(PrimFuncFrame, TIRFrame, PrimFuncFrameNode); }; +/*! + * \brief A frame that represents the block. + * + * \sa BlockFrame + */ +class BlockFrameNode : public TIRFrameNode { + public: + /*! \brief The name of the block. */ + String name; + /*! \brief The variables of the block. */ + Array iter_vars; + /*! \brief The read buffer regions of the block. */ + Optional> reads; + /*! \brief The write buffer regions of the block. */ + Optional> writes; + /*! \brief The init statement of the block. */ + Optional init; + /*! \brief The buffer allocated in the block. */ + Array alloc_buffers; + /*! \brief The match buffer regions. */ + Array match_buffers; + /*! \brief The annotation of the block. */ + Optional> annotations; + /*! \brief The corresponding values of the iter vars. */ + Array iter_values; + /*! + * \brief The predicate of the block realization, the block will only be executed when the + * predicate is true. + */ + Optional predicate; + /*! \brief The flag whether to construct BlockRealize or Block.
*/ + bool no_realize; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("name", &name); + v->Visit("iter_vars", &iter_vars); + v->Visit("reads", &reads); + v->Visit("writes", &writes); + v->Visit("init", &init); + v->Visit("alloc_buffers", &alloc_buffers); + v->Visit("match_buffers", &match_buffers); + v->Visit("annotations", &annotations); + v->Visit("iter_values", &iter_values); + v->Visit("predicate", &predicate); + v->Visit("no_realize", &no_realize); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.BlockFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(BlockFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to BlockFrameNode. + * + * \sa BlockFrameNode + */ + +class BlockFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode); +}; + /*! * \brief A frame that represents the assert statement. Proceeds if the condition is true, * otherwise aborts with the message. diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index cee60ad4f827..615ce90383dd 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -34,6 +34,14 @@ namespace tir { */ PrimFuncFrame PrimFunc(); +/*! + * \brief The block declaration statement. + * \param name The name of the block. + * \param no_realize The flag whether to construct BlockRealize or Block. + * \return The BlockFrame. + */ +BlockFrame Block(String name, bool no_realize = false); + /*! * \brief Evaluate the input expression. * \param value The input expression to evaluate. 
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py index 61418e0b2aa6..0e7eb2bb4720 100644 --- a/python/tvm/script/ir_builder/tir/frame.py +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -29,3 +29,8 @@ class TIRFrame(IRBuilderFrame): @_register_object("script.ir_builder.tir.PrimFuncFrame") class PrimFuncFrame(TIRFrame): ... + + +@_register_object("script.ir_builder.tir.BlockFrame") +class BlockFrame(TIRFrame): + ... diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index ae5d5b260f65..7ba2f6df9418 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -33,6 +33,25 @@ def prim_func() -> frame.PrimFuncFrame: return _ffi_api.PrimFunc() # pylint: disable=no-member # type: ignore +def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame: + """The block declaration statement. + + Parameters + ---------- + name : str + The name of the block. + + no_realize : bool + The flag whether to construct BlockRealize or Block. + + Returns + ------- + res : frame.BlockFrame + The BlockFrame. + """ + return _ffi_api.Block(name, no_realize) # pylint: disable=no-member # type: ignore + + def evaluate(value: PrimExpr) -> None: """Evaluate the input expression. 
@@ -50,6 +69,7 @@ def evaluate(value: PrimExpr) -> None: __all__ = [ + "block", "evaluate", "prim_func", ] diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc index 139c8193b0ba..dd3097e388b7 100644 --- a/src/script/ir_builder/tir/frame.cc +++ b/src/script/ir_builder/tir/frame.cc @@ -50,8 +50,32 @@ void PrimFuncFrameNode::ExitWithScope() { } } +void BlockFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + Array tir_alloc_buffers; + for (const tvm::tir::Buffer& buffer : alloc_buffers) { + tir_alloc_buffers.push_back(buffer); + } + Map attrs = annotations.value_or({}); + if (int detect_access = (!reads.defined()) | (!writes.defined() << 1)) { + attrs.Set("tir.script_parsing_detect_access", tvm::IntImm(DataType::Int(64), detect_access)); + } + tvm::tir::Block block(iter_vars, reads.value_or(Array()), + writes.value_or(Array()), name, AsStmt(stmts), init, + tir_alloc_buffers, match_buffers, attrs); + if (no_realize) { + CHECK(iter_values.empty()) + << "ValueError: Block bindings are not allowed when `no_realize=True`"; + CHECK(!predicate.defined()) << "ValueError: `T.where` is not allowed when `no_realize=True`"; + AddToParent(block); + } else { + AddToParent(tvm::tir::BlockRealize(iter_values, predicate.value_or(Bool(true)), block)); + } +} + TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); +TVM_REGISTER_NODE_TYPE(BlockFrameNode); } // namespace tir } // namespace ir_builder diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 5f994d71ca0a..4c2679ae6b56 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -41,8 +41,25 @@ PrimFuncFrame PrimFunc() { return PrimFuncFrame(n); } +BlockFrame Block(String name, bool no_realize) { + ObjectPtr n = make_object(); + n->name = name; + n->iter_vars.clear(); + n->reads = NullOpt; + n->writes = NullOpt; + n->init = NullOpt; + n->alloc_buffers.clear(); + n->match_buffers.clear(); + 
n->annotations = NullOpt; + n->iter_values.clear(); + n->predicate = NullOpt; + n->no_realize = no_realize; + return BlockFrame(n); +} + void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); } // namespace tir } // namespace ir_builder diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h index 47557917cca5..4f8b3f77c6e1 100644 --- a/src/script/ir_builder/tir/utils.h +++ b/src/script/ir_builder/tir/utils.h @@ -60,6 +60,15 @@ inline PrimFuncFrame FindPrimFuncFrame(const String& method) { throw; } +inline BlockFrame FindBlockFrame(const String& method) { + if (Optional frame = IRBuilder::Current()->GetLastFrame()) { + return frame.value(); + } + LOG(FATAL) << "ValueError: Block frame not find. Please ensure '" << method + << "' is called under T.block()"; + throw; +} + } // namespace tir } // namespace ir_builder } // namespace script diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 70a8f3565d03..85080c7c65fc 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -45,5 +45,32 @@ def test_ir_builder_tir_primfunc(): assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) +def test_ir_builder_tir_block(): + with IRBuilder() as ib: + with T.block("block"): + T.evaluate(0) + # the block generated by IRBuilder + block_realize_actual = ib.get() + + # the expected block + block_expected = tir.Block( + iter_vars=[], + reads=[], + writes=[], + name_hint="block", + body=tir.Evaluate(0), + alloc_buffers=None, + match_buffers=None, + annotations={"tir.script_parsing_detect_access": tir.IntImm("int64", 3)}, + ) + 
block_realize_expected = tir.BlockRealize( + iter_values=[], + predicate=True, + block=block_expected, + ) + # Check if the generated ir is expected + assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) + + if __name__ == "__main__": tvm.testing.main() From a63d03a116e6b8a3a80b96a90519a96ca63e16b9 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Mon, 12 Sep 2022 16:07:00 +0800 Subject: [PATCH 149/704] [MetaSchedule] Fix typo of compare between GlobalVar and str (#12704) fix typo of compare between GlobalVar and str --- python/tvm/meta_schedule/default_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 652f09261b2f..ac4028ec50f8 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -53,7 +53,7 @@ def mod(mod: Union[PrimFunc, IRModule]) -> IRModule: # pylint: disable=redefine raise TypeError(f"Expected `mod` to be PrimFunc or IRModule, but gets: {mod}") func_names = mod.get_global_vars() (func_name,) = func_names - if len(func_names) == 1 and func_name != "main": + if len(func_names) == 1 and func_name.name_hint != "main": mod = IRModule({"main": mod[func_name]}) return mod From a047e0228a3f7015e56c6756cdadb13444008623 Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Mon, 12 Sep 2022 12:23:44 -0400 Subject: [PATCH 150/704] [CI] Always install into a python venv in ci containers (#12663) This PR changes all ci_ to install TVM Python dependencies in a virtualenv separate from the system Python dependencies. Sets the stage for adding the poetry-based dependency generator to the CI container build process. * Always install into a python venv in ci containers. * Respect Dockerfile ENV PATH modifications in docker/bash.sh lookups. 
--- docker/Dockerfile.ci_arm | 11 +- docker/Dockerfile.ci_cortexm | 15 +- docker/Dockerfile.ci_cpu | 11 +- docker/Dockerfile.ci_gpu | 12 +- docker/Dockerfile.ci_hexagon | 11 +- docker/Dockerfile.ci_i386 | 11 +- docker/Dockerfile.ci_lint | 11 +- docker/Dockerfile.ci_minimal | 11 +- docker/Dockerfile.ci_riscv | 15 +- docker/Dockerfile.ci_wasm | 11 +- docker/install/ubuntu1804_install_python.sh | 45 ---- .../install/ubuntu1804_install_python_venv.sh | 30 --- docker/install/ubuntu_install_python.sh | 79 ++++-- docker/python/bootstrap-requirements.txt | 82 ++++++ docker/python/bootstrap/.gitignore | 1 + docker/python/bootstrap/generate.sh | 100 +++++++ .../bootstrap/lockfiles/constraints-3.7.txt | 254 ++++++++++++++++++ .../bootstrap/lockfiles/constraints-3.8.txt | 251 +++++++++++++++++ .../bootstrap/lockfiles/requirements-3.7.txt | 3 + .../bootstrap/lockfiles/requirements-3.8.txt | 3 + docker/python/ci-constraints.txt | 39 +++ docker/with_the_same_user | 26 +- 22 files changed, 875 insertions(+), 157 deletions(-) delete mode 100755 docker/install/ubuntu1804_install_python.sh delete mode 100755 docker/install/ubuntu1804_install_python_venv.sh create mode 100644 docker/python/bootstrap-requirements.txt create mode 100644 docker/python/bootstrap/.gitignore create mode 100755 docker/python/bootstrap/generate.sh create mode 100644 docker/python/bootstrap/lockfiles/constraints-3.7.txt create mode 100644 docker/python/bootstrap/lockfiles/constraints-3.8.txt create mode 100644 docker/python/bootstrap/lockfiles/requirements-3.7.txt create mode 100644 docker/python/bootstrap/lockfiles/requirements-3.8.txt create mode 100644 docker/python/ci-constraints.txt diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm index 90fbef4d441a..932687f1e568 100644 --- a/docker/Dockerfile.ci_arm +++ b/docker/Dockerfile.ci_arm @@ -47,11 +47,12 @@ ENV PATH /opt/sccache:$PATH COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh RUN bash /install/ubuntu_install_llvm.sh 
-COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index d646704bb0a8..6ca2f2f40b75 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -29,18 +29,15 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0 -COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh -RUN bash /install/ubuntu1804_install_python_venv.sh -ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index d9f353d41be1..00fd9a4fcab3 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -28,11 +28,12 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 6f02ab97c09e..4b729a5f516e 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -38,13 +38,15 @@ RUN bash /install/ubuntu_install_cmake_source.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +# Globally disable pip cache RUN bash /install/ubuntu_install_cmake_source.sh COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon index 66b78ae0800c..d2ed29278488 100644 --- a/docker/Dockerfile.ci_hexagon +++ b/docker/Dockerfile.ci_hexagon @@ -31,11 +31,12 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu2004_install_python.sh /install/ubuntu2004_install_python.sh -RUN bash /install/ubuntu2004_install_python.sh - -# Globally disable pip cache -RUN pip config set global.cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.8 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
# Rust env (build early; takes a while) COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386 index 0b6d8d28c4d7..dc767ff6def1 100644 --- a/docker/Dockerfile.ci_i386 +++ b/docker/Dockerfile.ci_i386 @@ -35,18 +35,19 @@ RUN bash /install/ubuntu_install_googletest.sh COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh RUN bash /install/ubuntu_install_llvm.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - # Rust env (build early; takes a while) COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh RUN bash /install/ubuntu_install_rust.sh ENV RUSTUP_HOME /opt/rust ENV CARGO_HOME /opt/rust ENV PATH $PATH:$CARGO_HOME/bin +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 437ea71bd4be..860a43fa2194 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -26,11 +26,12 @@ RUN apt-get update --fix-missing RUN apt-install-and-clear -y wget git sudo make parallel -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh 
/install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. RUN apt-get update && apt-install-and-clear -y doxygen graphviz curl shellcheck diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal index cf548989eba2..974f3eea11d6 100644 --- a/docker/Dockerfile.ci_minimal +++ b/docker/Dockerfile.ci_minimal @@ -28,11 +28,12 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index 1ca792e20c98..9b956d55ddaa 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -29,18 +29,15 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh -COPY install/ubuntu1804_install_python_venv.sh /install/ubuntu1804_install_python_venv.sh -RUN bash /install/ubuntu1804_install_python_venv.sh -ENV PATH=/opt/tvm-venv/bin:/opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. 
COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm index 49435b4f3d47..17230312f041 100644 --- a/docker/Dockerfile.ci_wasm +++ b/docker/Dockerfile.ci_wasm @@ -26,11 +26,12 @@ RUN bash /install/ubuntu_install_core.sh COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh -COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh -RUN bash /install/ubuntu1804_install_python.sh - -# Globally disable pip cache -RUN pip config set global.no-cache-dir false +ENV TVM_VENV /venv/apache-tvm-py3.7 +COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles +COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh +RUN bash /install/ubuntu_install_python.sh +ENV PATH ${TVM_VENV}/bin:$PATH +ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/install/ubuntu1804_install_python.sh b/docker/install/ubuntu1804_install_python.sh deleted file mode 100755 index 2cdddbd451a6..000000000000 --- a/docker/install/ubuntu1804_install_python.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -u -set -o pipefail - - -cleanup() { - rm -rf base-requirements.txt -} - -trap cleanup 0 - - -# Install python and pip. Don't modify this to add Python package dependencies, -# instead modify install_python_package.sh -apt-get update -apt-install-and-clear -y software-properties-common python3.7 python3.7-dev python3-pip -update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1 - -# Pin pip and setuptools versions -# Hashes generated via: -# $ pip download == -# $ pip hash --algorithm sha512 .whl -cat < base-requirements.txt -pip==19.3.1 --hash=sha256:6917c65fc3769ecdc61405d3dfd97afdedd75808d200b2838d7d961cebc0c2c7 -setuptools==58.4.0 --hash=sha256:e8b1d3127a0441fb99a130bcc3c2bf256c2d3ead3aba8fd400e5cbbaf788e036 -EOF -pip3 install -r base-requirements.txt diff --git a/docker/install/ubuntu1804_install_python_venv.sh b/docker/install/ubuntu1804_install_python_venv.sh deleted file mode 100755 index 3f0fb3ee8971..000000000000 --- a/docker/install/ubuntu1804_install_python_venv.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e -set -u -set -o pipefail - -# install python and pip, don't modify this, modify install_python_package.sh -apt-get update -apt-install-and-clear -y software-properties-common python3.7-dev python3-setuptools python3.7-venv - -python3 -mvenv /opt/tvm-venv - -# Pin pip and setuptools versions -/opt/tvm-venv/bin/pip3 install pip==19.3.1 setuptools==58.4.0 diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh index ec50682c1454..66a80e1fdc52 100755 --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -18,28 +18,77 @@ set -e set -u -# Used for debugging RVM build -set -x set -o pipefail -# install python and pip, don't modify this, modify install_python_package.sh +set -x + +if [ -z "${TVM_VENV+x}" ]; then + echo "ERROR: expect TVM_VENV env var to be set" + exit 2 +fi + apt-get update -apt-install-and-clear -y python-dev -# python 3.6 +# Ensure lsb-release is installed. +apt-install-and-clear -y \ + lsb-core + +release=$(lsb_release -sc) +if [ "${release}" == "bionic" ]; then + PYTHON_VERSION=3.7 +elif [ "${release}" == "focal" ]; then + PYTHON_VERSION=3.8 +else + echo "Don't know which version of python to install for lsb-release ${release}" + exit 2 +fi + +# Install python and pip. 
Don't modify this to add Python package dependencies, +# instead modify install_python_package.sh apt-install-and-clear -y software-properties-common +apt-install-and-clear -y \ + acl \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-dev \ + python3-pip \ + python${PYTHON_VERSION}-venv -add-apt-repository -y ppa:deadsnakes/ppa -apt-get update -apt-install-and-clear -y python-pip python-dev python3.6 python3.6-dev +update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 + +# Allow disabling user site-packages, even with sudo; this makes it harder to repro CI failures +# locally because it's hard to tell what might be in this directory. +echo "Defaults env_keep += \"PYTHONNOUSERSITE\"" >/etc/sudoers.d/91-preserve-python-nousersite +export PYTHONNOUSERSITE=1 + +venv_dir="$(python3 -c "import os.path;print(os.path.dirname(\"${TVM_VENV}\"))")" +mkdir -p "${venv_dir}" +python3 -mvenv "${TVM_VENV}" +. "${TVM_VENV}/bin/activate" + +# Update pip to match version used to produce requirements-hashed.txt. This step +# is necessary so that pip's dependency solver is recent. +pip_spec=$(cat /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt | grep 'pip==') +pip3 install -U --require-hashes -r <(echo "${pip_spec}") \ + -c /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt -rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3 +# Python configuration +pip3 config set global.no-cache-dir true # Never cache packages -# python 3.7 -apt-install-and-clear -y python3.7 +# Now install the remaining base packages. 
+pip3 install \ + --require-hashes \ + -r /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt -# Install pip -wget -q https://bootstrap.pypa.io/get-pip.py && python3.7 get-pip.py +addgroup tvm-venv +chgrp -R tvm-venv "${TVM_VENV}" +setfacl -R -d -m group:tvm-venv:rwx "${TVM_VENV}" -# Pin pip and setuptools versions -pip3 install pip==19.3.1 setuptools==58.4.0 +# Prevent further use of pip3 via the system. +# There may be multiple (i.e. from python3-pip apt package and pip3 install -U). +deactivate +while [ "$(which pip3)" != "" ]; do + rm "$(which pip3)" +done +while [ "$(which pip)" != "" ]; do + rm "$(which pip)" +done diff --git a/docker/python/bootstrap-requirements.txt b/docker/python/bootstrap-requirements.txt new file mode 100644 index 000000000000..5c036b8ed97d --- /dev/null +++ b/docker/python/bootstrap-requirements.txt @@ -0,0 +1,82 @@ +CacheControl==0.12.11 \ + --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b +SecretStorage==3.3.2 \ + --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319 +cachy==0.3.0 \ + --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7 +certifi==2022.5.18.1 \ + --hash=sha256:f1d53542ee8cbedbe2118b5686372fb33c297fcd6379b050cca0ef13a597382a +cffi==1.15.0 \ + --hash=sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997 +charset-normalizer==2.0.12 \ + --hash=sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df +cleo==0.8.1 \ + --hash=sha256:141cda6dc94a92343be626bb87a0b6c86ae291dfc732a57bf04310d4b4201753 +clikit==0.6.2 \ + --hash=sha256:71268e074e68082306e23d7369a7b99f824a0ef926e55ba2665e911f7208489e +crashtest==0.3.1 \ + --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680 +cryptography==37.0.2 \ + --hash=sha256:0cc20f655157d4cfc7bada909dc5cc228211b075ba8407c46467f63597c78178 +distlib==0.3.4 \ + --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b 
+filelock==3.7.0 \ + --hash=sha256:c7b5fdb219b398a5b28c8e4c1893ef5f98ece6a38c6ab2c22e26ec161556fed6 +html5lib==1.1 \ + --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d +idna==3.3 \ + --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff +importlib-metadata==1.7.0 \ + --hash=sha256:dc15b2969b4ce36305c51eebe62d418ac7791e9a157911d58bfb1f9ccd8e2070 +jeepney==0.8.0 \ + --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 +keyring==22.3.0 \ + --hash=sha256:2bc8363ebdd63886126a012057a85c8cb6e143877afa02619ac7dbc9f38a207b +lockfile==0.12.2 \ + --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa +msgpack==1.0.3 \ + --hash=sha256:9c0903bd93cbd34653dd63bbfcb99d7539c372795201f39d16fdfde4418de43a +packaging==20.9 \ + --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a +pastel==0.2.1 \ + --hash=sha256:4349225fcdf6c2bb34d483e523475de5bb04a5c10ef711263452cb37d7dd4364 +pexpect==4.8.0 \ + --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 +pip==22.1.1 \ + --hash=sha256:e7bcf0b2cbdec2af84cc1b7b79b25fdbd7228fbdb61a4dca0b82810d0ba9d18b +pkginfo==1.8.2 \ + --hash=sha256:c24c487c6a7f72c66e816ab1796b96ac6c3d14d49338293d2141664330b55ffc +platformdirs==2.5.2 \ + --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 +poetry==1.1.13 \ + --hash=sha256:52deb0792a2e801967ba9c4cdb39b56fe68b0b5cd3f195b004bef603db9d51a7 +poetry-core==1.0.8 \ + --hash=sha256:54b0fab6f7b313886e547a52f8bf52b8cf43e65b2633c65117f8755289061924 +ptyprocess==0.7.0 \ + --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 +pycparser==2.21 \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 +pylev==1.4.0 \ + --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd +pyparsing==3.0.9 \ + --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc 
+requests==2.27.1 \ + --hash=sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d +requests-toolbelt==0.9.1 \ + --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f +setuptools==62.3.2 \ + --hash=sha256:68e45d17c9281ba25dc0104eadd2647172b3472d9e01f911efa57965e8d51a36 +shellingham==1.4.0 \ + --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9 +six==1.16.0 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 +tomlkit==0.10.2 \ + --hash=sha256:905cf92c2111ef80d355708f47ac24ad1b6fc2adc5107455940088c9bbecaedb +urllib3==1.26.9 \ + --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 +virtualenv==20.14.1 \ + --hash=sha256:e617f16e25b42eb4f6e74096b9c9e37713cf10bf30168fb4a739f3fa8f898a3a +webencodings==0.5.1 \ + --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 +zipp==3.8.0 \ + --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 diff --git a/docker/python/bootstrap/.gitignore b/docker/python/bootstrap/.gitignore new file mode 100644 index 000000000000..3d2dbd4b6317 --- /dev/null +++ b/docker/python/bootstrap/.gitignore @@ -0,0 +1 @@ +/_venv diff --git a/docker/python/bootstrap/generate.sh b/docker/python/bootstrap/generate.sh new file mode 100755 index 000000000000..116b8d8daee0 --- /dev/null +++ b/docker/python/bootstrap/generate.sh @@ -0,0 +1,100 @@ +#!/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +set -x + +cd "$(dirname "$0")" + +rm -rf build lockfiles +mkdir build +mkdir lockfiles + +function lock() { + mkdir -p build/$1 + cat >build/$1/pyproject.toml <= "3.7" and python_version < "4.0" \ + --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b \ + --hash=sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144 +cachy==0.3.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \ + --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7 \ + --hash=sha256:186581f4ceb42a0bbe040c407da73c14092379b1e4c0e327fdb72ae4a9b269b1 +certifi==2022.6.15; python_version >= "3.7" and python_version < "4" \ + --hash=sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412 \ + --hash=sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d +cffi==1.15.1; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \ + --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \ + --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \ + --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \ + --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \ + --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \ + 
--hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \ + --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \ + --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \ + --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \ + --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \ + --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \ + --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \ + --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \ + --hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \ + --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \ + --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \ + --hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \ + --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \ + --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \ + --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \ + --hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \ + --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \ + --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \ + --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \ + --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \ + --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \ + --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \ + --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \ + --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \ + 
--hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \ + --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \ + --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \ + --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \ + --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \ + --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \ + --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \ + --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \ + --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \ + --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \ + --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \ + --hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \ + --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \ + --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \ + --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \ + --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \ + --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \ + --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \ + --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \ + --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \ + --hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \ + --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \ + --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \ + --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 \ + 
--hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \ + --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \ + --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \ + --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \ + --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \ + --hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \ + --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \ + --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \ + --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \ + --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9 +charset-normalizer==2.1.0; python_version >= "3.7" and python_version < "4" and python_full_version >= "3.6.0" \ + --hash=sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413 \ + --hash=sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5 +cleo==1.0.0a5; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:ff53056589300976e960f75afb792dfbfc9c78dcbb5a448e207a17b643826360 \ + --hash=sha256:097c9d0e0332fd53cc89fc11eb0a6ba0309e6a3933c08f7b38558555486925d3 +crashtest==0.3.1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680 \ + --hash=sha256:42ca7b6ce88b6c7433e2ce47ea884e91ec93104a4b754998be498a8e6c3d37dd +cryptography==37.0.3; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:d10413d493e98075060d3e62e5826de372912ea653ccc948f3c41b21ddca087f \ + --hash=sha256:cd64147ff16506632893ceb2569624b48c84daa3ba4d89695f7c7bc24188eee9 \ + --hash=sha256:17c74f7d9e9e9bb7e84521243695c1b4bdc3a0e44ca764e6bcf8f05f3de3d0df \ + 
--hash=sha256:0713bee6c8077786c56bdec9c5d3f099d40d2c862ff3200416f6862e9dd63156 \ + --hash=sha256:b9c2008417741cdfbe945ef2d16b7b7ba0790886a0b49e1de533acf93eb66ed6 \ + --hash=sha256:646905ff7a712e415bf0d0f214e0eb669dd2257c4d7a27db1e8baec5d2a1d55f \ + --hash=sha256:dcafadb5a06cb7a6bb49fb4c1de7414ee2f8c8e12b047606d97c3175d690f582 \ + --hash=sha256:0b4bfc5ccfe4e5c7de535670680398fed4a0bbc5dfd52b3a295baad42230abdf \ + --hash=sha256:a03dbc0d8ce8c1146c177cd0e3a66ea106f36733fb1b997ea4d051f8a68539ff \ + --hash=sha256:190a24c14e91c1fa3101069aac7e77d11c5a73911c3904128367f52946bbb6fd \ + --hash=sha256:b05c5478524deb7a019e240f2a970040c4b0f01f58f0425e6262c96b126c6a3e \ + --hash=sha256:891ed8312840fd43e0696468a6520a582a033c0109f7b14b96067bfe1123226b \ + --hash=sha256:30d6aabf623a01affc7c0824936c3dde6590076b61f5dd299df3cc2c75fc5915 \ + --hash=sha256:31a7c1f1c2551f013d4294d06e22848e2ccd77825f0987cba3239df6ebf7b020 \ + --hash=sha256:a94fd1ff80001cb97add71d07f596d8b865b716f25ef501183e0e199390e50d3 \ + --hash=sha256:8a85dbcc770256918b40c2f40bd3ffd3b2ae45b0cf19068b561db8f8d61bf492 \ + --hash=sha256:773d5b5f2e2bd2c7cbb1bd24902ad41283c88b9dd463a0f82adc9a2870d9d066 \ + --hash=sha256:0f9193428a55a4347af2d4fd8141a2002dedbcc26487e67fd2ae19f977ee8afc \ + --hash=sha256:7bf652c73e8f7c32a3f92f7184bf7f9106dacdf5ef59c3c3683d7dae2c4972fb \ + --hash=sha256:c3c8b1ad2c266fdf7adc041cc4156d6a3d14db93de2f81b26a5af97ef3f209e5 \ + --hash=sha256:2383d6c3088e863304c37c65cd2ea404b7fbb4886823eab1d74137cc27f3d2ee \ + --hash=sha256:ae430d51c67ac638dfbb42edf56c669ca9c74744f4d225ad11c6f3d355858187 +distlib==0.3.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b \ + --hash=sha256:e4b58818180336dc9c529bfb9a0b58728ffc09ad92027a3f30b7cd91e3458579 +entrypoints==0.3; python_version >= "3.7" and python_version < "4.0" 
\ + --hash=sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19 \ + --hash=sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451 +filelock==3.7.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404 \ + --hash=sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04 +html5lib==1.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \ + --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f +idna==3.3; python_version >= "3.7" and python_version < "4" \ + --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff \ + --hash=sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d +importlib-metadata==4.12.0; python_version >= "3.7" and python_version < "3.8" and (python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "3.8" or python_version >= "3.7" and python_version < "3.8" and python_full_version >= "3.5.0") \ + --hash=sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23 \ + --hash=sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670 +jeepney==0.8.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 \ + --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 +keyring==23.6.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:372ff2fc43ab779e3f87911c26e6c7acc8bb440cbd82683e383ca37594cb0617 \ + 
--hash=sha256:3ac00c26e4c93739e19103091a9986a9f79665a78cf15a4df1dba7ea9ac8da2f +lockfile==0.12.2; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa \ + --hash=sha256:6aed02de03cba24efabcd600b30540140634fc06cfa603822d508d5361e9f799 +msgpack==1.0.4; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:4ab251d229d10498e9a2f3b1e68ef64cb393394ec477e3370c457f9430ce9250 \ + --hash=sha256:112b0f93202d7c0fef0b7810d465fde23c746a2d482e1e2de2aafd2ce1492c88 \ + --hash=sha256:002b5c72b6cd9b4bafd790f364b8480e859b4712e91f43014fe01e4f957b8467 \ + --hash=sha256:35bc0faa494b0f1d851fd29129b2575b2e26d41d177caacd4206d81502d4c6a6 \ + --hash=sha256:4733359808c56d5d7756628736061c432ded018e7a1dff2d35a02439043321aa \ + --hash=sha256:eb514ad14edf07a1dbe63761fd30f89ae79b42625731e1ccf5e1f1092950eaa6 \ + --hash=sha256:c23080fdeec4716aede32b4e0ef7e213c7b1093eede9ee010949f2a418ced6ba \ + --hash=sha256:49565b0e3d7896d9ea71d9095df15b7f75a035c49be733051c34762ca95bbf7e \ + --hash=sha256:aca0f1644d6b5a73eb3e74d4d64d5d8c6c3d577e753a04c9e9c87d07692c58db \ + --hash=sha256:0dfe3947db5fb9ce52aaea6ca28112a170db9eae75adf9339a1aec434dc954ef \ + --hash=sha256:4dea20515f660aa6b7e964433b1808d098dcfcabbebeaaad240d11f909298075 \ + --hash=sha256:e83f80a7fec1a62cf4e6c9a660e39c7f878f603737a0cdac8c13131d11d97f52 \ + --hash=sha256:3c11a48cf5e59026ad7cb0dc29e29a01b5a66a3e333dc11c04f7e991fc5510a9 \ + --hash=sha256:1276e8f34e139aeff1c77a3cefb295598b504ac5314d32c8c3d54d24fadb94c9 \ + --hash=sha256:6c9566f2c39ccced0a38d37c26cc3570983b97833c365a6044edef3574a00c08 \ + --hash=sha256:fcb8a47f43acc113e24e910399376f7277cf8508b27e5b88499f053de6b115a8 \ + --hash=sha256:76ee788122de3a68a02ed6f3a16bbcd97bc7c2e39bd4d94be2f1821e7c4a64e6 \ + --hash=sha256:0a68d3ac0104e2d3510de90a1091720157c319ceeb90d74f7b5295a6bee51bae \ + --hash=sha256:85f279d88d8e833ec015650fd15ae5eddce0791e1e8a59165318f371158efec6 \ + 
--hash=sha256:c1683841cd4fa45ac427c18854c3ec3cd9b681694caf5bff04edb9387602d661 \ + --hash=sha256:a75dfb03f8b06f4ab093dafe3ddcc2d633259e6c3f74bb1b01996f5d8aa5868c \ + --hash=sha256:9667bdfdf523c40d2511f0e98a6c9d3603be6b371ae9a238b7ef2dc4e7a427b0 \ + --hash=sha256:11184bc7e56fd74c00ead4f9cc9a3091d62ecb96e97653add7a879a14b003227 \ + --hash=sha256:ac5bd7901487c4a1dd51a8c58f2632b15d838d07ceedaa5e4c080f7190925bff \ + --hash=sha256:1e91d641d2bfe91ba4c52039adc5bccf27c335356055825c7f88742c8bb900dd \ + --hash=sha256:2a2df1b55a78eb5f5b7d2a4bb221cd8363913830145fad05374a80bf0877cb1e \ + --hash=sha256:545e3cf0cf74f3e48b470f68ed19551ae6f9722814ea969305794645da091236 \ + --hash=sha256:2cc5ca2712ac0003bcb625c96368fd08a0f86bbc1a5578802512d87bc592fe44 \ + --hash=sha256:eba96145051ccec0ec86611fe9cf693ce55f2a3ce89c06ed307de0e085730ec1 \ + --hash=sha256:7760f85956c415578c17edb39eed99f9181a48375b0d4a94076d84148cf67b2d \ + --hash=sha256:449e57cc1ff18d3b444eb554e44613cffcccb32805d16726a5494038c3b93dab \ + --hash=sha256:d603de2b8d2ea3f3bcb2efe286849aa7a81531abc52d8454da12f46235092bcb \ + --hash=sha256:48f5d88c99f64c456413d74a975bd605a9b0526293218a3b77220a2c15458ba9 \ + --hash=sha256:6916c78f33602ecf0509cc40379271ba0f9ab572b066bd4bdafd7434dee4bc6e \ + --hash=sha256:81fc7ba725464651190b196f3cd848e8553d4d510114a954681fd0b9c479d7e1 \ + --hash=sha256:d5b5b962221fa2c5d3a7f8133f9abffc114fe218eb4365e40f17732ade576c8e \ + --hash=sha256:77ccd2af37f3db0ea59fb280fa2165bf1b096510ba9fe0cc2bf8fa92a22fdb43 \ + --hash=sha256:b17be2478b622939e39b816e0aa8242611cc8d3583d1cd8ec31b249f04623243 \ + --hash=sha256:2bb8cdf50dd623392fa75525cce44a65a12a00c98e1e37bf0fb08ddce2ff60d2 \ + --hash=sha256:26b8feaca40a90cbe031b03d82b2898bf560027160d3eae1423f4a67654ec5d6 \ + --hash=sha256:462497af5fd4e0edbb1559c352ad84f6c577ffbbb708566a0abaaa84acd9f3ae \ + --hash=sha256:2999623886c5c02deefe156e8f869c3b0aaeba14bfc50aa2486a0415178fce55 \ + --hash=sha256:f0029245c51fd9473dc1aede1160b0a29f4a912e6b1dd353fa6d317085b219da \ + 
--hash=sha256:ed6f7b854a823ea44cf94919ba3f727e230da29feb4a99711433f25800cf747f \ + --hash=sha256:0df96d6eaf45ceca04b3f3b4b111b86b33785683d682c655063ef8057d61fd92 \ + --hash=sha256:6a4192b1ab40f8dca3f2877b70e63799d95c62c068c84dc028b40a6cb03ccd0f \ + --hash=sha256:0e3590f9fb9f7fbc36df366267870e77269c03172d086fa76bb4eba8b2b46624 \ + --hash=sha256:1576bd97527a93c44fa856770197dec00d223b0b9f36ef03f65bac60197cedf8 \ + --hash=sha256:63e29d6e8c9ca22b21846234913c3466b7e4ee6e422f205a2988083de3b08cae \ + --hash=sha256:fb62ea4b62bfcb0b380d5680f9a4b3f9a2d166d9394e9bbd9666c0ee09a3645c \ + --hash=sha256:4d5834a2a48965a349da1c5a79760d94a1a0172fbb5ab6b5b33cbf8447e109ce \ + --hash=sha256:f5d869c18f030202eb412f08b28d2afeea553d6613aee89e200d7aca7ef01f5f +packaging==20.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \ + --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \ + --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 +pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ + --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c +pkginfo==1.8.3; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.0" \ + --hash=sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594 \ + --hash=sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c +platformdirs==2.5.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \ + 
--hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19 +poetry-core==1.1.0b2; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:48ef71ff8a4c2f0b4eaf9c138c12feb96dbf32e65baac8ca673769d05edf142f \ + --hash=sha256:4967fe08f745291b353328d4226d378a1731de2997a25b7a0c891e302460108d +poetry==1.2.0b1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:e3d68c88492550c48df10c738e962f1f770ad71e715bab878a46f527e1ce81d2 \ + --hash=sha256:26cf8d309a74fff25d768219c2215a989a530acab886c01de3db07ab70bc7abf +ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ + --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 +pycparser==2.21; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "linux" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" and python_full_version >= "3.4.0" \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + --hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 +pylev==1.4.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd \ + --hash=sha256:9e77e941042ad3a4cc305dcdf2b2dec1aec2fbe3dd9015d2698ad02b173006d1 +pyparsing==3.0.9; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.8" \ + --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc \ + --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb +pywin32-ctypes==0.2.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" \ + --hash=sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942 \ + --hash=sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98 
+requests-toolbelt==0.9.1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0 \ + --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f +requests==2.28.1; python_version >= "3.7" and python_version < "4" \ + --hash=sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349 \ + --hash=sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983 +secretstorage==3.3.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319 \ + --hash=sha256:0a8eb9645b320881c222e827c26f4cfcf55363e8b374a021981ef886657a912f +shellingham==1.4.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9 \ + --hash=sha256:4855c2458d6904829bd34c299f11fdeed7cfefbf8a2c522e4caea6cd76b3171e +six==1.16.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 +tomlkit==0.11.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:0f4050db66fd445b885778900ce4dd9aea8c90c4721141fde0d6ade893820ef1 \ + --hash=sha256:71ceb10c0eefd8b8f11fe34e8a51ad07812cb1dc3de23247425fbc9ddc47b9dd +typing-extensions==4.3.0; python_version >= "3.7" and python_version < "3.8" \ + --hash=sha256:25642c956049920a5aa49edcdd6ab1e06d7e5d467fc00e0506c44ac86fbfca02 \ + --hash=sha256:e6d2677a32f47fc7eb2795db1dd15c1f34eff616bcaf2cfb5e997f854fa1c4a6 +urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version < "4" and python_version 
>= "3.7" \ + --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 \ + --hash=sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e +virtualenv==20.15.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:b30aefac647e86af6d82bfc944c556f8f1a9c90427b2fb4e3bfbf338cb82becf \ + --hash=sha256:288171134a2ff3bfb1a2f54f119e77cd1b81c29fc1265a2356f3e8d14c7d58c4 +webencodings==0.5.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ + --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 +zipp==3.8.0; python_version >= "3.7" and python_version < "3.8" \ + --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 \ + --hash=sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad +pip==22.1.2 --hash=sha256:a3edacb89022ef5258bf61852728bf866632a394da837ca49eb4303635835f17 +setuptools==62.6.0 --hash=sha256:c1848f654aea2e3526d17fc3ce6aeaa5e7e24e66e645b5be2171f3f6b4e5a178 diff --git a/docker/python/bootstrap/lockfiles/constraints-3.8.txt b/docker/python/bootstrap/lockfiles/constraints-3.8.txt new file mode 100644 index 000000000000..f15e0e427cd4 --- /dev/null +++ b/docker/python/bootstrap/lockfiles/constraints-3.8.txt @@ -0,0 +1,251 @@ +cachecontrol==0.12.11; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:2c75d6a8938cb1933c75c50184549ad42728a27e9f6b92fd677c3151aa72555b \ + --hash=sha256:a5b9fcc986b184db101aa280b42ecdcdfc524892596f606858e0b7a8b4d9e144 +cachy==0.3.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and 
python_full_version >= "3.4.0" \ + --hash=sha256:338ca09c8860e76b275aff52374330efedc4d5a5e45dc1c5b539c1ead0786fe7 \ + --hash=sha256:186581f4ceb42a0bbe040c407da73c14092379b1e4c0e327fdb72ae4a9b269b1 +certifi==2022.6.15; python_version >= "3.7" and python_version < "4" \ + --hash=sha256:fe86415d55e84719d75f8b69414f6438ac3547d2078ab91b67e779ef69378412 \ + --hash=sha256:84c85a9078b11105f04f3036a9482ae10e4621616db313fe045dd24743a0820d +cffi==1.15.1; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2 \ + --hash=sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2 \ + --hash=sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914 \ + --hash=sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3 \ + --hash=sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e \ + --hash=sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162 \ + --hash=sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b \ + --hash=sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21 \ + --hash=sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185 \ + --hash=sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd \ + --hash=sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc \ + --hash=sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f \ + --hash=sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e \ + --hash=sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4 \ + --hash=sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01 \ + --hash=sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e \ + --hash=sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2 \ + 
--hash=sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d \ + --hash=sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac \ + --hash=sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83 \ + --hash=sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9 \ + --hash=sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c \ + --hash=sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325 \ + --hash=sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c \ + --hash=sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef \ + --hash=sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8 \ + --hash=sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d \ + --hash=sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104 \ + --hash=sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7 \ + --hash=sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6 \ + --hash=sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d \ + --hash=sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a \ + --hash=sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405 \ + --hash=sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e \ + --hash=sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf \ + --hash=sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497 \ + --hash=sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375 \ + --hash=sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e \ + --hash=sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82 \ + --hash=sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b \ + --hash=sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c \ + 
--hash=sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426 \ + --hash=sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9 \ + --hash=sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045 \ + --hash=sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3 \ + --hash=sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a \ + --hash=sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5 \ + --hash=sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca \ + --hash=sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02 \ + --hash=sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192 \ + --hash=sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314 \ + --hash=sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5 \ + --hash=sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585 \ + --hash=sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0 \ + --hash=sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415 \ + --hash=sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d \ + --hash=sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984 \ + --hash=sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35 \ + --hash=sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27 \ + --hash=sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76 \ + --hash=sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3 \ + --hash=sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee \ + --hash=sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c \ + --hash=sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9 +charset-normalizer==2.1.0; python_version >= "3.7" and python_version < "4" and 
python_full_version >= "3.6.0" \ + --hash=sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413 \ + --hash=sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5 +cleo==1.0.0a5; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:ff53056589300976e960f75afb792dfbfc9c78dcbb5a448e207a17b643826360 \ + --hash=sha256:097c9d0e0332fd53cc89fc11eb0a6ba0309e6a3933c08f7b38558555486925d3 +crashtest==0.3.1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:300f4b0825f57688b47b6d70c6a31de33512eb2fa1ac614f780939aa0cf91680 \ + --hash=sha256:42ca7b6ce88b6c7433e2ce47ea884e91ec93104a4b754998be498a8e6c3d37dd +cryptography==37.0.3; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:d10413d493e98075060d3e62e5826de372912ea653ccc948f3c41b21ddca087f \ + --hash=sha256:cd64147ff16506632893ceb2569624b48c84daa3ba4d89695f7c7bc24188eee9 \ + --hash=sha256:17c74f7d9e9e9bb7e84521243695c1b4bdc3a0e44ca764e6bcf8f05f3de3d0df \ + --hash=sha256:0713bee6c8077786c56bdec9c5d3f099d40d2c862ff3200416f6862e9dd63156 \ + --hash=sha256:b9c2008417741cdfbe945ef2d16b7b7ba0790886a0b49e1de533acf93eb66ed6 \ + --hash=sha256:646905ff7a712e415bf0d0f214e0eb669dd2257c4d7a27db1e8baec5d2a1d55f \ + --hash=sha256:dcafadb5a06cb7a6bb49fb4c1de7414ee2f8c8e12b047606d97c3175d690f582 \ + --hash=sha256:0b4bfc5ccfe4e5c7de535670680398fed4a0bbc5dfd52b3a295baad42230abdf \ + --hash=sha256:a03dbc0d8ce8c1146c177cd0e3a66ea106f36733fb1b997ea4d051f8a68539ff \ + --hash=sha256:190a24c14e91c1fa3101069aac7e77d11c5a73911c3904128367f52946bbb6fd \ + --hash=sha256:b05c5478524deb7a019e240f2a970040c4b0f01f58f0425e6262c96b126c6a3e \ + --hash=sha256:891ed8312840fd43e0696468a6520a582a033c0109f7b14b96067bfe1123226b \ + --hash=sha256:30d6aabf623a01affc7c0824936c3dde6590076b61f5dd299df3cc2c75fc5915 \ + --hash=sha256:31a7c1f1c2551f013d4294d06e22848e2ccd77825f0987cba3239df6ebf7b020 \ + 
--hash=sha256:a94fd1ff80001cb97add71d07f596d8b865b716f25ef501183e0e199390e50d3 \ + --hash=sha256:8a85dbcc770256918b40c2f40bd3ffd3b2ae45b0cf19068b561db8f8d61bf492 \ + --hash=sha256:773d5b5f2e2bd2c7cbb1bd24902ad41283c88b9dd463a0f82adc9a2870d9d066 \ + --hash=sha256:0f9193428a55a4347af2d4fd8141a2002dedbcc26487e67fd2ae19f977ee8afc \ + --hash=sha256:7bf652c73e8f7c32a3f92f7184bf7f9106dacdf5ef59c3c3683d7dae2c4972fb \ + --hash=sha256:c3c8b1ad2c266fdf7adc041cc4156d6a3d14db93de2f81b26a5af97ef3f209e5 \ + --hash=sha256:2383d6c3088e863304c37c65cd2ea404b7fbb4886823eab1d74137cc27f3d2ee \ + --hash=sha256:ae430d51c67ac638dfbb42edf56c669ca9c74744f4d225ad11c6f3d355858187 +distlib==0.3.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b \ + --hash=sha256:e4b58818180336dc9c529bfb9a0b58728ffc09ad92027a3f30b7cd91e3458579 +entrypoints==0.3; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19 \ + --hash=sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451 +filelock==3.7.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404 \ + --hash=sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04 +html5lib==1.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d \ + --hash=sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f +idna==3.3; python_version >= 
"3.7" and python_version < "4" \ + --hash=sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff \ + --hash=sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d +importlib-metadata==4.12.0; python_version >= "3.7" and python_version < "3.10" \ + --hash=sha256:7401a975809ea1fdc658c3aa4f78cc2195a0e019c5cbc4c06122884e9ae80c23 \ + --hash=sha256:637245b8bab2b6502fcbc752cc4b7a6f6243bb02b31c5c26156ad103d3d45670 +jeepney==0.8.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 \ + --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 +keyring==23.6.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:372ff2fc43ab779e3f87911c26e6c7acc8bb440cbd82683e383ca37594cb0617 \ + --hash=sha256:3ac00c26e4c93739e19103091a9986a9f79665a78cf15a4df1dba7ea9ac8da2f +lockfile==0.12.2; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa \ + --hash=sha256:6aed02de03cba24efabcd600b30540140634fc06cfa603822d508d5361e9f799 +msgpack==1.0.4; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:4ab251d229d10498e9a2f3b1e68ef64cb393394ec477e3370c457f9430ce9250 \ + --hash=sha256:112b0f93202d7c0fef0b7810d465fde23c746a2d482e1e2de2aafd2ce1492c88 \ + --hash=sha256:002b5c72b6cd9b4bafd790f364b8480e859b4712e91f43014fe01e4f957b8467 \ + --hash=sha256:35bc0faa494b0f1d851fd29129b2575b2e26d41d177caacd4206d81502d4c6a6 \ + --hash=sha256:4733359808c56d5d7756628736061c432ded018e7a1dff2d35a02439043321aa \ + --hash=sha256:eb514ad14edf07a1dbe63761fd30f89ae79b42625731e1ccf5e1f1092950eaa6 \ + --hash=sha256:c23080fdeec4716aede32b4e0ef7e213c7b1093eede9ee010949f2a418ced6ba \ + --hash=sha256:49565b0e3d7896d9ea71d9095df15b7f75a035c49be733051c34762ca95bbf7e \ + --hash=sha256:aca0f1644d6b5a73eb3e74d4d64d5d8c6c3d577e753a04c9e9c87d07692c58db \ + 
--hash=sha256:0dfe3947db5fb9ce52aaea6ca28112a170db9eae75adf9339a1aec434dc954ef \ + --hash=sha256:4dea20515f660aa6b7e964433b1808d098dcfcabbebeaaad240d11f909298075 \ + --hash=sha256:e83f80a7fec1a62cf4e6c9a660e39c7f878f603737a0cdac8c13131d11d97f52 \ + --hash=sha256:3c11a48cf5e59026ad7cb0dc29e29a01b5a66a3e333dc11c04f7e991fc5510a9 \ + --hash=sha256:1276e8f34e139aeff1c77a3cefb295598b504ac5314d32c8c3d54d24fadb94c9 \ + --hash=sha256:6c9566f2c39ccced0a38d37c26cc3570983b97833c365a6044edef3574a00c08 \ + --hash=sha256:fcb8a47f43acc113e24e910399376f7277cf8508b27e5b88499f053de6b115a8 \ + --hash=sha256:76ee788122de3a68a02ed6f3a16bbcd97bc7c2e39bd4d94be2f1821e7c4a64e6 \ + --hash=sha256:0a68d3ac0104e2d3510de90a1091720157c319ceeb90d74f7b5295a6bee51bae \ + --hash=sha256:85f279d88d8e833ec015650fd15ae5eddce0791e1e8a59165318f371158efec6 \ + --hash=sha256:c1683841cd4fa45ac427c18854c3ec3cd9b681694caf5bff04edb9387602d661 \ + --hash=sha256:a75dfb03f8b06f4ab093dafe3ddcc2d633259e6c3f74bb1b01996f5d8aa5868c \ + --hash=sha256:9667bdfdf523c40d2511f0e98a6c9d3603be6b371ae9a238b7ef2dc4e7a427b0 \ + --hash=sha256:11184bc7e56fd74c00ead4f9cc9a3091d62ecb96e97653add7a879a14b003227 \ + --hash=sha256:ac5bd7901487c4a1dd51a8c58f2632b15d838d07ceedaa5e4c080f7190925bff \ + --hash=sha256:1e91d641d2bfe91ba4c52039adc5bccf27c335356055825c7f88742c8bb900dd \ + --hash=sha256:2a2df1b55a78eb5f5b7d2a4bb221cd8363913830145fad05374a80bf0877cb1e \ + --hash=sha256:545e3cf0cf74f3e48b470f68ed19551ae6f9722814ea969305794645da091236 \ + --hash=sha256:2cc5ca2712ac0003bcb625c96368fd08a0f86bbc1a5578802512d87bc592fe44 \ + --hash=sha256:eba96145051ccec0ec86611fe9cf693ce55f2a3ce89c06ed307de0e085730ec1 \ + --hash=sha256:7760f85956c415578c17edb39eed99f9181a48375b0d4a94076d84148cf67b2d \ + --hash=sha256:449e57cc1ff18d3b444eb554e44613cffcccb32805d16726a5494038c3b93dab \ + --hash=sha256:d603de2b8d2ea3f3bcb2efe286849aa7a81531abc52d8454da12f46235092bcb \ + --hash=sha256:48f5d88c99f64c456413d74a975bd605a9b0526293218a3b77220a2c15458ba9 \ + 
--hash=sha256:6916c78f33602ecf0509cc40379271ba0f9ab572b066bd4bdafd7434dee4bc6e \ + --hash=sha256:81fc7ba725464651190b196f3cd848e8553d4d510114a954681fd0b9c479d7e1 \ + --hash=sha256:d5b5b962221fa2c5d3a7f8133f9abffc114fe218eb4365e40f17732ade576c8e \ + --hash=sha256:77ccd2af37f3db0ea59fb280fa2165bf1b096510ba9fe0cc2bf8fa92a22fdb43 \ + --hash=sha256:b17be2478b622939e39b816e0aa8242611cc8d3583d1cd8ec31b249f04623243 \ + --hash=sha256:2bb8cdf50dd623392fa75525cce44a65a12a00c98e1e37bf0fb08ddce2ff60d2 \ + --hash=sha256:26b8feaca40a90cbe031b03d82b2898bf560027160d3eae1423f4a67654ec5d6 \ + --hash=sha256:462497af5fd4e0edbb1559c352ad84f6c577ffbbb708566a0abaaa84acd9f3ae \ + --hash=sha256:2999623886c5c02deefe156e8f869c3b0aaeba14bfc50aa2486a0415178fce55 \ + --hash=sha256:f0029245c51fd9473dc1aede1160b0a29f4a912e6b1dd353fa6d317085b219da \ + --hash=sha256:ed6f7b854a823ea44cf94919ba3f727e230da29feb4a99711433f25800cf747f \ + --hash=sha256:0df96d6eaf45ceca04b3f3b4b111b86b33785683d682c655063ef8057d61fd92 \ + --hash=sha256:6a4192b1ab40f8dca3f2877b70e63799d95c62c068c84dc028b40a6cb03ccd0f \ + --hash=sha256:0e3590f9fb9f7fbc36df366267870e77269c03172d086fa76bb4eba8b2b46624 \ + --hash=sha256:1576bd97527a93c44fa856770197dec00d223b0b9f36ef03f65bac60197cedf8 \ + --hash=sha256:63e29d6e8c9ca22b21846234913c3466b7e4ee6e422f205a2988083de3b08cae \ + --hash=sha256:fb62ea4b62bfcb0b380d5680f9a4b3f9a2d166d9394e9bbd9666c0ee09a3645c \ + --hash=sha256:4d5834a2a48965a349da1c5a79760d94a1a0172fbb5ab6b5b33cbf8447e109ce \ + --hash=sha256:f5d869c18f030202eb412f08b28d2afeea553d6613aee89e200d7aca7ef01f5f +packaging==20.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.4.0" \ + --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \ + --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 +pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" 
\ + --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ + --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c +pkginfo==1.8.3; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.0" \ + --hash=sha256:848865108ec99d4901b2f7e84058b6e7660aae8ae10164e015a6dcf5b242a594 \ + --hash=sha256:a84da4318dd86f870a9447a8c98340aa06216bfc6f2b7bdc4b8766984ae1867c +platformdirs==2.5.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788 \ + --hash=sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19 +poetry-core==1.1.0b2; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:48ef71ff8a4c2f0b4eaf9c138c12feb96dbf32e65baac8ca673769d05edf142f \ + --hash=sha256:4967fe08f745291b353328d4226d378a1731de2997a25b7a0c891e302460108d +poetry==1.2.0b1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:e3d68c88492550c48df10c738e962f1f770ad71e715bab878a46f527e1ce81d2 \ + --hash=sha256:26cf8d309a74fff25d768219c2215a989a530acab886c01de3db07ab70bc7abf +ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ + --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 +pycparser==2.21; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "linux" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" and python_full_version >= "3.4.0" \ + --hash=sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9 \ + 
--hash=sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206 +pylev==1.4.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:7b2e2aa7b00e05bb3f7650eb506fc89f474f70493271a35c242d9a92188ad3dd \ + --hash=sha256:9e77e941042ad3a4cc305dcdf2b2dec1aec2fbe3dd9015d2698ad02b173006d1 +pyparsing==3.0.9; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.8" \ + --hash=sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc \ + --hash=sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb +pywin32-ctypes==0.2.0; python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" \ + --hash=sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942 \ + --hash=sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98 +requests-toolbelt==0.9.1; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0 \ + --hash=sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f +requests==2.28.1; python_version >= "3.7" and python_version < "4" \ + --hash=sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349 \ + --hash=sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983 +secretstorage==3.3.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "linux" \ + --hash=sha256:755dc845b6ad76dcbcbc07ea3da75ae54bb1ea529eb72d15f83d26499a5df319 \ + --hash=sha256:0a8eb9645b320881c222e827c26f4cfcf55363e8b374a021981ef886657a912f +shellingham==1.4.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:536b67a0697f2e4af32ab176c00a50ac2899c5a05e0d8e2dadac8e58888283f9 \ + --hash=sha256:4855c2458d6904829bd34c299f11fdeed7cfefbf8a2c522e4caea6cd76b3171e +six==1.16.0; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and 
python_full_version >= "3.5.0" \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 +tomlkit==0.11.0; python_version >= "3.7" and python_version < "4.0" \ + --hash=sha256:0f4050db66fd445b885778900ce4dd9aea8c90c4721141fde0d6ade893820ef1 \ + --hash=sha256:71ceb10c0eefd8b8f11fe34e8a51ad07812cb1dc3de23247425fbc9ddc47b9dd +urllib3==1.26.9; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_full_version >= "3.5.0" and python_version < "4" and python_version >= "3.7" \ + --hash=sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14 \ + --hash=sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e +virtualenv==20.15.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:b30aefac647e86af6d82bfc944c556f8f1a9c90427b2fb4e3bfbf338cb82becf \ + --hash=sha256:288171134a2ff3bfb1a2f54f119e77cd1b81c29fc1265a2356f3e8d14c7d58c4 +webencodings==0.5.1; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.5.0" \ + --hash=sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78 \ + --hash=sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923 +zipp==3.8.0; python_version >= "3.7" and python_version < "3.10" \ + --hash=sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099 \ + --hash=sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad +pip==22.1.2 --hash=sha256:a3edacb89022ef5258bf61852728bf866632a394da837ca49eb4303635835f17 +setuptools==62.6.0 --hash=sha256:c1848f654aea2e3526d17fc3ce6aeaa5e7e24e66e645b5be2171f3f6b4e5a178 diff --git 
a/docker/python/bootstrap/lockfiles/requirements-3.7.txt b/docker/python/bootstrap/lockfiles/requirements-3.7.txt new file mode 100644 index 000000000000..43a3c2405739 --- /dev/null +++ b/docker/python/bootstrap/lockfiles/requirements-3.7.txt @@ -0,0 +1,3 @@ +pip +poetry +setuptools diff --git a/docker/python/bootstrap/lockfiles/requirements-3.8.txt b/docker/python/bootstrap/lockfiles/requirements-3.8.txt new file mode 100644 index 000000000000..43a3c2405739 --- /dev/null +++ b/docker/python/bootstrap/lockfiles/requirements-3.8.txt @@ -0,0 +1,3 @@ +pip +poetry +setuptools diff --git a/docker/python/ci-constraints.txt b/docker/python/ci-constraints.txt new file mode 100644 index 000000000000..6e586b14ae3d --- /dev/null +++ b/docker/python/ci-constraints.txt @@ -0,0 +1,39 @@ +# This file lists packages we intentionally hold back in CI for no reason other than that +# updates outside of these bounds require a considerable amount of work, and allowing them to float +# freely would mean that small changes to the TVM dependency set could be held up behind large +# migration tasks if a new version of these packages were to be released. Holding packages back +# here allows us to decide when to tackle such migration work. +#keras = "^2.6.0" +#mxnet = "^1.6.0" + +#black = "<21.8b0" # Breaks tensorflow-gpu. Revisit when tensorflow is upgraded. 
+blocklint = "==0.2.3" +#commonmark = ">=0.7.3" +cpplint = "==1.6.0" +#docutils = ">=0.11,<0.17" +#ethos-u-vela = "==3.2.0" +flake8 = "==3.9.2" +flowvision = "==0.1.0" +#h5py = "==3.1.0" +keras = "==2.7" +jinja2 = "==3.0.3" +mxnet = "==1.6.0" +mypy = "==0.902" +oneflow = "==0.7.0" +onnx = "==1.10.2" +onnxruntime = "==1.9.0" +numpy = "==1.19.3" +paddlepaddle = "==2.1.3" +pillow = "==9.1.0" +pylint = "==2.4.4" +scipy = "==1.7.3" +sphinx = "==4.2.0" +sphinx-gallery = "==0.4.0" +tensorflow = "==2.7.2" +tensorflow-aarch64 = "==2.7.2" +tensorflow-estimator = "==2.7.0" +tensorflow-gpu = "==2.7.2" +tflite = "==2.4.0" +torch = "==1.11.0" +torchvision = "==0.12.0+cpu" +#xgboost = "==1.4.2" diff --git a/docker/with_the_same_user b/docker/with_the_same_user index 71e701dcfb59..397b885ee166 100644 --- a/docker/with_the_same_user +++ b/docker/with_the_same_user @@ -25,7 +25,13 @@ set -e -COMMAND=("$@") +# NOTE: sudo uses the env_reset option to reset environment variables to a secure bare minimum. +# The --preserve-env option below passes those variables through to the invoked process; however, +# this appears not to affect the environment used with execve, so we resolve the binary to run +# in this file using the $PATH specified in the Dockerfile. +COMMAND=( "$(which "$1")" ) +shift +COMMAND=( "${COMMAND[@]}" "$@" ) if ! touch /this_is_writable_file_system; then echo "You can't write to your filesystem!" 
@@ -41,21 +47,23 @@ getent group "${CI_BUILD_GID}" || ( if grep -q "^${CI_BUILD_GROUP}:" /etc/group; then CI_BUILD_GROUP="${CI_BUILD_GROUP}2" fi - addgroup --force-badname --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}") + addgroup --force-badname --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" >/dev/null) + +getent group tvm-venv || (addgroup tvm-venv >/dev/null) getent passwd "${CI_BUILD_UID}" || adduser --force-badname --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \ --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}" -usermod -a -G sudo "${CI_BUILD_USER}" +usermod -a -G sudo -G tvm-venv "${CI_BUILD_USER}" # Add user to video group for ROCm -if [[ ! -z $ROCM_ENABLED ]]; then +if [[ ! -z "${ROCM_ENABLED-}" ]]; then usermod -a -G video "${CI_BUILD_USER}" fi # This is a grotesque hack to get PYTEST_ADD_OPTS available to all task scripts. echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo -if [[ ! -z $CUDA_VISIBLE_DEVICES ]]; then +if [[ ! 
-z "${CUDA_VISIBLE_DEVICES-}" ]]; then CUDA_ENV="CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" else CUDA_ENV="" @@ -65,8 +73,8 @@ sudo -u "#${CI_BUILD_UID}" --preserve-env \ ${CUDA_ENV} \ PATH=${PATH} \ JAVA_HOME=${JAVA_HOME} \ -LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \ -PYTHONPATH=${PYTHONPATH} \ -CI_IMAGE_NAME=${CI_IMAGE_NAME} \ -HOME=${CI_BUILD_HOME} \ +LD_LIBRARY_PATH="${LD_LIBRARY_PATH-}" \ +PYTHONPATH="${PYTHONPATH-}" \ +CI_IMAGE_NAME="${CI_IMAGE_NAME-}" \ +HOME="${CI_BUILD_HOME-}" \ "${COMMAND[@]}" From b22b872da800b0b44feeca67e808319e21b840a2 Mon Sep 17 00:00:00 2001 From: Anirudh Sundar Date: Tue, 13 Sep 2022 00:44:40 +0530 Subject: [PATCH 151/704] [Hexagon] Add Hand written HVX conv2d (#12204) * [Hexagon] Add Hand written HVX conv2d Co-authored-by: Krzysztof Parzyszek * Address review comments Co-authored-by: Krzysztof Parzyszek * Add some more comments and a file rename * Add gtest unit tests for blockize/deblockize * Add gtest unit tests fp16 utils Co-authored-by: Krzysztof Parzyszek --- cmake/modules/Hexagon.cmake | 10 + include/tvm/runtime/hexagon/ops/conv2d.h | 198 +++++++ src/runtime/hexagon/ops/conv2d_fp16_hvx.cc | 489 ++++++++++++++++++ src/runtime/hexagon/ops/conv_utils.cc | 243 +++++++++ .../hexagon/hexagon_fp16_utils_tests.cc | 289 +++++++++++ .../topi/test_conv2d_fp16_intrin.py | 248 +++++++++ 6 files changed, 1477 insertions(+) create mode 100644 include/tvm/runtime/hexagon/ops/conv2d.h create mode 100644 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc create mode 100644 src/runtime/hexagon/ops/conv_utils.cc create mode 100644 tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc create mode 100644 tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index c08ea5eb1df1..aad770120201 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -172,6 +172,16 @@ if(BUILD_FOR_HEXAGON) list(APPEND TVM_RUNTIME_LINKER_LIBS -Wl,--whole-archive 
${USE_HEXAGON_SDK}/libs/qhl/prebuilt/hexagon_toolv84_v68/libqhmath.a -Wl,--no-whole-archive) endif() + + # Hand-written ops + file_glob_append(RUNTIME_HEXAGON_SRCS + "${TVMRT_SOURCE_DIR}/hexagon/ops/*.cc" + ) + + set_source_files_properties( + "${TVMRT_SOURCE_DIR}/hexagon/ops/conv2d_fp16_hvx.cc" + PROPERTIES COMPILE_FLAGS "-mhvx" + ) endif() if(USE_HEXAGON_RPC) diff --git a/include/tvm/runtime/hexagon/ops/conv2d.h b/include/tvm/runtime/hexagon/ops/conv2d.h new file mode 100644 index 000000000000..d759149727e8 --- /dev/null +++ b/include/tvm/runtime/hexagon/ops/conv2d.h @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include + +#ifndef TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ +#define TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ + +namespace tvm { +namespace runtime { +namespace hexagon { +static constexpr auto hexagon_device = DLDevice{static_cast(kDLHexagon), 0}; + +// Standalone DLTensor: the standalone-ness means that this object owns the shape +// (as opposed to a DLTensor). 
+template +class SDLTensor : public DLTensor { + public: + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, const int64_t* data_dims) + : SDLTensor(data_ptr, data_type, data_space) { + for (size_t i = 0; i < NDIM; ++i) dims[i] = data_dims[i]; + } + + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space, + std::initializer_list data_dims) + : SDLTensor(data_ptr, data_type, data_space, data_dims.begin()) {} + + void* GetDataSpace() const { return data_space; } + + private: + /** + * @brief Construct SDLTensor + * + * @param data_ptr Either points to the same memory as data_space or an array of pointers to the + * start of each chunk of weight. Since weights can be of varying sizes, this array could contain + * the pointer to each chunk of memory + * @param data_type data type of the elements in Tensor + * @param data_space is meant to store the pointer returned from AllocDataSpace and can be freed + * by passing it to FreeDataSpace + */ + SDLTensor(void* data_ptr, DLDataType data_type, void* data_space) : data_space(data_space) { + data = data_ptr; + device = hexagon_device; + ndim = NDIM; + dtype = data_type; + shape = dims; + strides = nullptr; + byte_offset = 0; + } + + void* data_space = nullptr; + int64_t dims[NDIM]; +}; + +inline void* to_ptr(uintptr_t v) { return reinterpret_cast(v); } + +inline uintptr_t to_uint(void* ptr) { return reinterpret_cast(ptr); } + +constexpr int xyc_to_sm_16b(int y, int x, int c) { + // Map y,x,c coordinates within a block to the offset (in 16-bit elements) + // from the beginning of the block in spatial-major layout. 
+ // 10-bit spatial mask: yyyxcccccx + assert(y >= 0 && x >= 0 && c >= 0); + return y << 7 | (x & 2) << 5 | c << 1 | (x & 1); +} + +constexpr int hwio_to_sm_16b(int width, int y, int x, int i, int o) { + // Map y,x,i,o coordinates within a chunk (assuming the origin at the + // top-left spatial corner) to the offset (in 16-bit elements) from the + // beginning of the chunk in spatial-major layout. + // Spatial mask: p..piiiioooooi, where p..p are position bits. + assert(width >= 1); + assert(y >= 0 && x >= 0 && i >= 0 && o >= 0); + int p = y * width + (width - 1 - x); + return p << 10 | (i & 0x1e) << 5 | o << 1 | (i & 1); +} + +inline constexpr int round_up(int v, int p2) { return (v + p2 - 1) & -p2; } + +// Returns the block address at the given index +// Assumptions +// - The data type of tensor is fp16 +// - There is only one batch, and hence n==0 +inline uintptr_t nhwc_at(const DLTensor& a, int n, int y, int x, int c) { + if (y < 0 || y >= a.shape[1]) return uintptr_t(0); + auto p = static_cast(a.data); + assert(n == 0); + return p[y * a.shape[2] * a.shape[3] + x * a.shape[3] + c]; +} + +// Returns the address of the chunk stored at given index +// Assumptions +// - The data type of tensor is fp16 +inline uintptr_t hwio_at(const DLTensor& f, int y, int x, int i, int o) { + auto p = static_cast(f.data); + return p[y * f.shape[1] * f.shape[2] * f.shape[3] + x * f.shape[2] * f.shape[3] + i * f.shape[3] + + o]; +} + +/** + * @brief Function to "blockize" the flat input data + * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * + * The input is mapped into the below mentioned layout (notation similar to index map used for + * transform layout): + * + * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * + * where AXIS_SEPARATOR represents split up in the physical layout + * + * @param out Pre-allocated output memory pointer + * @param inp_flat Flat input data pointer + * @param height + * 
@param width + * @param depth + */ +void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth); + +/** + * @brief Convert back from non-contiguous layout to a flat layout + * + * @param out_flat Pre-allocated output memory pointer + * @param inp Blockized input data pointer + * @param height + * @param width + * @param depth + */ +void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth); + +/** + * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below: + * + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented in an index map notation, the + * base split up is mentioned below with a few gotchas + * + * lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2 + * + * The gotchas are: + * - (w%4) is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 + * - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different sizes + * (thereby the name "chunked" instead of packed) + * - The thinnest chunk of width is stored first. 
For example, if a kernel is 5x5, the first + * chunk along the width has size 1 (representing index 0) and then next one has size 4 + * representing indices (1,2,3,4) + * + * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses + * of the "chunked" weights + * @param out_ptr_size The number of chunks + * @param out Pointer to pre-allocated output memory + * @param inp Pointer to flat input data + * @param height + * @param width + * @param idepth + * @param odepth + */ +void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, + int width, int idepth, int odepth); + +SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, + bool copy_data); + +int calculate_num_weight_chunks(int64_t* shape_hwio); + +SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table); + +template +void release(tvm::runtime::DeviceAPI* device_api, const SDLTensor& tensor) { + if (auto* data_space = tensor.GetDataSpace()) { + device_api->FreeDataSpace(hexagon_device, data_space); + } +} + +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_HEXAGON_OPS_CONV2D_H_ diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc new file mode 100644 index 000000000000..cf4dc43c6515 --- /dev/null +++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc @@ -0,0 +1,489 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +// Current limitations: +// - N in NHWC must be 1 +// - dilated convolutions are not supported +// - Bias is not accepted +// - Optional "relu" is not performed + +// Packed arguments: +// 0: DLTensor activations (NHWC) +// 1: DLTensor weights (HWIO) +// 2: int offset_top +// 3: int offset_left +// 4: int stride_h +// 5: int stride_w +// 6: DLTensor output (NHWC) +extern "C" int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle); + +namespace tvm { +namespace runtime { +namespace hexagon { + +/** + * @brief Returns the pointer to the element within the given block + * assuming fp16 type and specific layout as mentioned in blockize_hwc_16b. 
+ * All the below params are explained with the same layout assumption + * + * @param block_out_y y-index of block + * @param block_out_x x-index of block + * @param block_out_c c-index of block + * @param yi height-offset within the block + * @param xio outer width offset within the block + * @param ci channel offset within the block + * @param xii inner width offset within the block + * @param tensor base DLTensor + * + * @return The pointer to the element within the given block + */ +static inline uint16_t* getElementPtr(int block_out_y, int block_out_x, int block_out_c, int yi, + int xio, int ci, int xii, const DLTensor& tensor) { + auto block_ptr = nhwc_at(tensor, 0, block_out_y, block_out_x, block_out_c); + auto block_offset = yi * 128 + xio * 64 + ci * 2 + xii; + auto first_element_ptr = reinterpret_cast(block_ptr); + return first_element_ptr + block_offset; +} + +/** + * @brief Compute 2 vectors with ones in the even and odd lanes + * + * Output vectors are: + * vector 1 = [0xFFFF,0x0000,0xFFFF,0x0000,...,0xFFFF,0x0000] + * vector lanes = [ 0 , 1 , 2 , 3 ,..., 62 , 63 ] + * + * vector 2 = [0x0000,0xFFFF,0x0000,0xFFFF,...,0x0000,0xFFFF] + * vector lanes = [ 0 , 1 , 2 , 3 ,..., 62 , 63 ] + * + * @return Return the 2 vectors + */ +inline std::pair getOddEvenOnes() { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_Vh_vsplat_R(0xFFFF); + + HVX_Vector v1e = Q6_Vh_vshuffe_VhVh(v0, v1); + HVX_Vector v1o = Q6_V_vnot_V(v1e); + return {v1e, v1o}; +} + +/** + * @brief Return the input vector filled with the 2 channel elements(which is the 1st and 3rd + * element) from base_ptr filled up 32 times to get 64 elements + * + * 1. It's generated by first creating 2 vectors "splatted" with the 2 required elements + * 2. Then we AND them with vectors containing all ones (0xFFFF) in the even and odd lanes + * 3. 
Finally those 2 vectors are OR'ed together + * + * @param base_ptr pointer to the first of the 2 channel elements to be filled + * + * @return input vector + */ +inline HVX_Vector getInputVector(uint16_t* base_ptr) { + HVX_Vector v1 = Q6_Vh_vsplat_R(base_ptr[0]); + HVX_Vector v2 = Q6_Vh_vsplat_R(base_ptr[2]); + + auto oddEvenOnes = getOddEvenOnes(); + auto v1e = oddEvenOnes.first; + auto v1o = oddEvenOnes.second; + + HVX_Vector v_even_vals = Q6_V_vand_VV(v1, v1e); + HVX_Vector v_odd_vals = Q6_V_vand_VV(v2, v1o); + + return Q6_V_vor_VV(v_even_vals, v_odd_vals); +} + +/** + * @brief Return the Output vector which contains the 32 output channels in the even lanes + * + * The output vector is computed as: + * 1. vector multiply(vmpy) of input and weights + * 2. Rotate the vector right by 1 element and add with the first vector to add the 2 input channels + * 3. Then convert the results back from qfloat16 to IEEE half-precision float + * 4. The added values are in even lanes, so zero out the odd lanes by ANDing with ones in even + * lanes and return + * + * @param act_vec Input activations vector + * @param wgt_vec Weights vector + * + * @return output vector with 32 output channels in the even lanes + */ +inline HVX_Vector computeOuputVector(HVX_Vector act_vec, HVX_Vector wgt_vec) { + HVX_Vector v_res = Q6_Vqf16_vmpy_VhfVhf(act_vec, wgt_vec); // result is in qf16 + HVX_Vector v_rot = Q6_V_vror_VR(v_res, 2); + HVX_Vector v_reduced = Q6_Vqf16_vadd_Vqf16Vqf16(v_res, v_rot); + HVX_Vector v_hf = Q6_Vhf_equals_Vqf16(v_reduced); + HVX_Vector v1e = getOddEvenOnes().first; + HVX_Vector v_reduced_even_lanes = Q6_V_vand_VV(v_hf, v1e); + return v_reduced_even_lanes; +} + +static int round_down(int v, int base) { return v - (v % base); } + +/** + * @brief Compute the convolution of inputs from cr_act, and weights from + * cr_filt to update the output to cr_out. 
The goal is to have an efficient + * HVX implementation + * + * Assumptions: + * ----------- + * - This implementation right now assumes that the dilation is 1 + * - there is zero padding or the input was already pre-padded. + * - block specific spatial padding is only expected at the end and hence + * pad_top and pad_left are not yet used + * - Relu activation is not used + * - Bias add is not done + * + * @param cr_out blockized output tensor with zeros already filled in + * @param cr_act blockized activations + * @param cr_filt Chunkified weights as returned from output of prepare_hwio + * @param out_shape Original output shape of the tensor before blockization + * @param act_shape Original input shape + * @param bias_flat Flat bias values and are not used right now + * TODO (quic-sanirudh) Add support for bias add + * @param filt_shape Original filter shape + * @param pad_shape Pad top and pad left shape + * @param relu Whether to apply relu after convolution, not done right now + * TODO (quic-sanirudh) Add support for relu activation + * @param stride_h Stride applied along the height when mapping output to activation indices + * @param stride_w Stride applied along the width when mapping output to activation indices + * @param zero_block A block filled with zeros + * + * @return + */ +void conv_layer_fp16_hvx(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) + const DLTensor& cr_filt, const DLTensor& out_shape, + const DLTensor& act_shape, const DLTensor& bias_flat, + const DLTensor& filt_shape, const DLTensor& pad_shape, bool relu, + int stride_h, int stride_w, uintptr_t zero_block) { + int64_t filt_height = filt_shape.shape[0]; + int64_t filt_width = filt_shape.shape[1]; + int64_t filt_idepth = filt_shape.shape[2]; + + int pad_top = pad_shape.shape[0]; + int pad_left = pad_shape.shape[1]; + LOG_INFO << "filt_height=" << filt_height << ", filt_width=" << filt_width + << ", filt_idepth=" << filt_idepth << ", pad_top=" << pad_top + << ", pad_left=" << pad_left << "\n"; + + ICHECK_LT(pad_top, 8) << "pad_top offset cannot be >= 8"; + ICHECK_LT(pad_left, 4) << "pad_left offset cannot be >= 4"; + + int a_height = cr_act.shape[1]; + int 
a_width = cr_act.shape[2]; + int a_depth = cr_act.shape[3]; + + int w_height = cr_filt.shape[0]; + int w_width = cr_filt.shape[1]; + + int o_depth = cr_out.shape[3]; + int b_depth = bias_flat.shape[0]; + + int o_height = cr_out.shape[1]; + int o_width = cr_out.shape[2]; + + int out_height = out_shape.shape[1]; + int out_width = out_shape.shape[2]; + + LOG_INFO << "a: 1x" << a_height << "x" << a_width << "x" << a_depth << ", w: " << w_height << "x" + << w_width << "x" << static_cast(cr_filt.shape[2]) << "x" + << static_cast(cr_filt.shape[3]) << ", o: 1x" << o_height << "x" << o_width << "x" + << o_depth << ", b: " << b_depth << ", out_shape: " << out_height << "x" << out_width + << "\n"; + + ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels"; + ICHECK_EQ(o_depth, cr_filt.shape[3]) << "output depth should match the weights output channel"; + + int rd = round_down(filt_width, 4); + int wgt_chunk_thin_width = filt_width - rd; + /* NOTE(review): this is 0 when filt_width is a multiple of 4, in which case computeConv + * computes fcw >= 1 for every fw; confirm that matches the chunk order built by + * chunkify_hwio_16b */ + + /* + * Compute the output vector of either 1 or 2 elements along the width and max 32 elements along + * the depth to constitute a maximum of 64 elements + * + * The weights are loaded directly in the order they're stored, which results + * in 2 input channels and 32 output channels + * + * Weights vector illustration: + * ------- ------ ------------ + * weights_vec = [0-0,0-1,1-0,1-1,2-0,2-1,3-0,3-1,4-0,4-1,...,31-0,31-1] -> This is the + * vector representation of weights, where the elements are represented as + * "out_channel-input_channel" + * + * + * Same 2 input channels have to be multiplied across all output channels in the weights. 
+ * + * Activations vector would thus be: + * ----------- ------ ----- ---- -- + * act_vec = [i0,i1,i0,i1,i0,i1,...,i0,i1] - 2 elements of the input channels broadcasted 32 times + * to fill 64 elements of the vector + * + * + * Thus the computation is just a vmpy(act_vec,weights_vec) followed by some rearrangement to + * add every pair of 16b lanes in the vector to reduce along the input channels + * + * This result is added to the result of the next pair of input channels all the way until we + * have reduced across the entire input channels. + * + * Then the same vector is added to the results of the following elements along the width and + * height to finally get 32 elements representing 32 output channels. + * + * Since the output block also has the 8h2w32c2w format, the 32 elements of the next element + * along the width is also added into the same vector such that the first 32 channel elements + * occupy the even lanes and the next 32 occupy the odd lanes to form a single 64-element vector + * which is then stored + */ + auto computeConv = [filt_height, filt_width, wgt_chunk_thin_width, filt_idepth, stride_h, + stride_w, &cr_out, &cr_act, &cr_filt](int out_act_y, int out_act_x, int out_c, + int h, int wo, bool skip_wi_1 = false) { + auto out_element_ptr = getElementPtr(out_act_y, out_act_x, out_c, h, wo, 0, 0, cr_out); + + LOG_INFO << "out_act_y: " << out_act_y << ", out_act_x: " << out_act_x << ", out_c: " << out_c + << ", h: " << h << ", wo: " << wo << " out_element_ptr: " << out_element_ptr; + + HVX_Vector* out_vector = reinterpret_cast(out_element_ptr); + HVX_Vector existing_out_vec = *out_vector; + + for (int fh = 0; fh < filt_height; ++fh) { + for (int fw = 0; fw < filt_width; ++fw) { + int fch = fh / 8; + int fcw = 0; + if (fw >= wgt_chunk_thin_width) { + fcw = (fw - wgt_chunk_thin_width) / 4 + 1; + } + int fx = (fw < wgt_chunk_thin_width) ? 
fw : ((fw - wgt_chunk_thin_width) % 4); + int fy = fh % 8; + for (int c = 0; c < round_up(filt_idepth, 2); c += 2) { + int out_act_cc = c / 32; + int ci = c % 32; + auto wgt_chunk = hwio_at(cr_filt, fch, fcw, out_act_cc, out_c); + + // Find weight chunk offset ptr + int max_x = (fcw == 0) ? wgt_chunk_thin_width : 4; + + int wi = 0; + + int out_width_idx = out_act_x * 4 + wo * 2 + wi; + int act_width_access_idx = out_width_idx * stride_w + fw; + int true_out_act_x = act_width_access_idx / 4; + int true_wo = (act_width_access_idx % 4) / 2; + int true_wi = act_width_access_idx % 2; + + int out_height_idx = out_act_y * 8 + h; + int act_height_access_idx = out_height_idx * stride_h + fh; + int true_out_act_y = act_height_access_idx / 8; + int true_h = act_height_access_idx % 8; + + int act_channel_idx = out_act_cc * 32 + ci; + + auto act_element_ptr = getElementPtr(true_out_act_y, true_out_act_x, out_act_cc, true_h, + true_wo, ci, true_wi, cr_act); + HVX_Vector act_vec = getInputVector(act_element_ptr); + + auto wgt_chunk_offset = hwio_to_sm_16b(max_x, fy, fx, ci, 0); + auto base_chunk_ptr = reinterpret_cast(wgt_chunk); + auto chunk_ptr = base_chunk_ptr + wgt_chunk_offset; + + LOG_INFO << "act: 0x" << act_height_access_idx << "x" << act_width_access_idx << "x" + << act_channel_idx << ", wgt: " << fh << "x" << fw << "x" << act_channel_idx + << "x" << out_c * 32 << ", out: 0x" << out_height_idx << "x" << out_width_idx + << "x" << out_c * 32 << ", wgt_chunk_offset: " << wgt_chunk_offset; + + const HVX_Vector* weights_vec_ptr = reinterpret_cast(chunk_ptr); + HVX_Vector weights_vec = *weights_vec_ptr; + + HVX_Vector reduced_vec_even_elements = computeOuputVector(act_vec, weights_vec); + + if (!skip_wi_1) { + wi = 1; + + out_width_idx = out_act_x * 4 + wo * 2 + wi; + act_width_access_idx = out_width_idx * stride_w + fw; + true_out_act_x = act_width_access_idx / 4; + true_wo = (act_width_access_idx % 4) / 2; + true_wi = act_width_access_idx % 2; + + act_element_ptr = 
getElementPtr(true_out_act_y, true_out_act_x, out_act_cc, true_h, + true_wo, ci, true_wi, cr_act); + act_vec = getInputVector(act_element_ptr); + + LOG_INFO << "act: 0x" << act_height_access_idx << "x" << act_width_access_idx << "x" + << act_channel_idx << ", wgt: " << fh << "x" << fw << "x" << act_channel_idx + << "x" << out_c * 32 << ", out: 0x" << out_height_idx << "x" << out_width_idx + << "x" << out_c * 32 << ", wgt_chunk_offset: " << wgt_chunk_offset; + + HVX_Vector reduced_vec_odd_elements = computeOuputVector(act_vec, weights_vec); + reduced_vec_odd_elements = Q6_V_vror_VR(reduced_vec_odd_elements, -2); + HVX_Vector out_final = Q6_V_vor_VV(reduced_vec_even_elements, reduced_vec_odd_elements); + + HVX_Vector out_vec_qf16 = Q6_Vqf16_vadd_VhfVhf(out_final, existing_out_vec); + existing_out_vec = Q6_Vhf_equals_Vqf16(out_vec_qf16); + } else { + HVX_Vector out_vec_qf16 = + Q6_Vqf16_vadd_VhfVhf(reduced_vec_even_elements, existing_out_vec); + existing_out_vec = Q6_Vhf_equals_Vqf16(out_vec_qf16); + } + } + } + } + *out_vector = existing_out_vec; + }; + + auto computeFullWidth = [&computeConv](int out_y, int out_x, int out_c, int h) { + for (int wo = 0; wo < 2; ++wo) { + computeConv(out_y, out_x, out_c, h, wo); + } + }; + + auto computePartialWidth = [out_width, o_width, &computeConv](int out_y, int out_c, int h) { + int out_x = o_width - 1; + int wo = 0; + for (; wo < (out_width % 4) / 2; ++wo) { + computeConv(out_y, out_x, out_c, h, wo); + } + + if (out_width % 2) { + computeConv(out_y, out_x, out_c, h, wo, true /* skip_wi_1 */); + } + }; + + for (int out_c = 0; out_c < cr_filt.shape[3]; ++out_c) { + for (int out_act_y = 0; out_act_y < out_height / 8; ++out_act_y) { + int out_y = out_act_y; + for (int out_act_x = 0; out_act_x < out_width / 4; ++out_act_x) { + int out_x = out_act_x; + for (int h = 0; h < 8; ++h) { + computeFullWidth(out_y, out_x, out_c, h); + } + } + + for (int h = 0; h < 8; ++h) { + computePartialWidth(out_y, out_c, h); + } + } + + int out_y = 
o_height - 1; + for (int h = 0; h < out_height % 8; ++h) { + for (int out_act_x = 0; out_act_x < out_width / 4; ++out_act_x) { + int out_x = out_act_x; + computeFullWidth(out_y, out_x, out_c, h); + } + computePartialWidth(out_y, out_c, h); + } + } +} +} // namespace hexagon +} // namespace runtime +} // namespace tvm + +/** + * @brief Packed-function entry point for the fp16 conv2d + * + * Expects 7 arguments: input activations tensor, weights tensor, pad_top, pad_left, stride_h, + * stride_w and the output tensor. The activations, weights and output are prepared with + * prepare_nhwc / prepare_hwio, the convolution is run with conv_layer_fp16_hvx and the result is + * deblockized into the output tensor before the temporary buffers are released. + * + * @return 0 on completion + */ +int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* out_val, + int out_code, void* res_handle) { + namespace hexagonrt = tvm::runtime::hexagon; + ICHECK_EQ(num_args, 7) << "Unexpected number of arguments"; + ICHECK_EQ(type_codes[0], kTVMDLTensorHandle) + << "First argument is expected to be the input tensor"; // Input activations + ICHECK_EQ(type_codes[1], kTVMDLTensorHandle) + << "Second argument is expected to be the weights tensor"; // Weights + ICHECK_EQ(type_codes[2], kDLInt) + << "Third argument is expected to be the pad_top offset"; // pad_top offset + ICHECK_EQ(type_codes[3], kDLInt) + << "Fourth argument is expected to be the pad_left offset"; // pad_left offset + ICHECK_EQ(type_codes[4], kDLInt) << "Fifth argument is expected to be the stride_h"; // stride_h + ICHECK_EQ(type_codes[5], kDLInt) << "Sixth argument is expected to be the stride_w"; // stride_w + ICHECK_EQ(type_codes[6], kTVMDLTensorHandle) + << "Seventh argument is expected to be the output tensor"; // output + + auto* act_flat = static_cast(args[0].v_handle); + auto* wgt_flat = static_cast(args[1].v_handle); + auto* out_flat = static_cast(args[6].v_handle); + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet"; + + // Temporary assertion until multiple batches are supported + ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet"; + + int pad_top = args[2].v_int64; + int pad_left = args[3].v_int64; + int stride_h = args[4].v_int64; + int stride_w = args[5].v_int64; + + LOG_INFO << "act.shape=" << act_flat->shape[0] << 
"x" << act_flat->shape[1] << "x" + << act_flat->shape[2] << "x" << act_flat->shape[3] + << ", wgt.shape=" << wgt_flat->shape[0] << "x" << wgt_flat->shape[1] << "x" + << wgt_flat->shape[2] << "x" << wgt_flat->shape[3] << ", pad_top=" << pad_top + << ", pad_left=" << pad_left; + + auto* device_api = tvm::runtime::DeviceAPI::Get(hexagonrt::hexagon_device, false); + ICHECK(device_api != nullptr); + tvm::runtime::String vtcm_scope = "global.vtcm"; + + auto act_vtcm = hexagonrt::prepare_nhwc(device_api, act_flat, /*copy_data=*/true); + + ICHECK_NE(wgt_flat->shape[0], 0) << "Weights height should not be zero"; + ICHECK_NE(wgt_flat->shape[1], 0) << "Weights width should not be zero"; + ICHECK_NE(wgt_flat->shape[2], 0) << "Weights input channels should not be zero"; + ICHECK_NE(wgt_flat->shape[3], 0) << "Weights output channels should not be zero"; + int num_wgt_chunks = hexagonrt::calculate_num_weight_chunks(wgt_flat->shape); + LOG_INFO << "num_wgt_chunks: " << num_wgt_chunks; + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto wgt_vtcm = hexagonrt::prepare_hwio(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table); + + auto out_vtcm = hexagonrt::prepare_nhwc(device_api, out_flat, /*copy_data=*/false); + /* NOTE(review): conv_layer_fp16_hvx accumulates into the existing output values and documents + * cr_out as zero-filled; with copy_data=false nothing is written to the buffer here, so + * confirm the allocation is zero-initialized */ + + // Prepare zero_block + int64_t block_nbytes = 2048; + void* zero_block = device_api->AllocDataSpace(hexagonrt::hexagon_device, 1, &block_nbytes, + tvm::runtime::DataType::UInt(8), vtcm_scope); + memset(zero_block, 0, 2048); + + // FIXME: Setting bias to zero_block: this works for up to 256 output channels. 
+ auto bias_flat = + hexagonrt::SDLTensor<1>(zero_block, wgt_flat->dtype, zero_block, &wgt_flat->shape[3]); + auto act_shape = hexagonrt::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); + auto filt_shape = hexagonrt::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); + auto pad_shape = hexagonrt::SDLTensor<2>(nullptr, act_flat->dtype, nullptr, {pad_top, pad_left}); + auto out_shape = hexagonrt::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); + bool relu = false; + + hexagonrt::conv_layer_fp16_hvx(out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, bias_flat, + filt_shape, pad_shape, relu, stride_h, stride_w, + hexagonrt::to_uint(zero_block)); + + hexagonrt::deblockize_hwc_16b(out_flat->data, out_vtcm.data, out_flat->shape[1], + out_flat->shape[2], out_flat->shape[3]); + + device_api->FreeDataSpace(hexagonrt::hexagon_device, zero_block); + hexagonrt::release(device_api, out_vtcm); + hexagonrt::release(device_api, wgt_vtcm); + hexagonrt::release(device_api, act_vtcm); + + return 0; +} diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc new file mode 100644 index 000000000000..e1ec1e17277d --- /dev/null +++ b/src/runtime/hexagon/ops/conv_utils.cc @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +namespace tvm { +namespace runtime { +namespace hexagon { + +/** + * @brief Function to "blockize" the flat input data + * The term "blockize" is used to mention that the data is stored in non-contiguous blocks + * + * The input is mapped into the below mentioned layout (notation similar to index map used for + * transform layout): + * + * lambda n, h, w, c: n, h//8, w//4, c//32, AXIS_SEPARATOR, h%8, (w%4)//2, c%32, w%2 + * + * where AXIS_SEPARATOR represents split up in the physical layout + * + * Positions of a block that fall outside height/width/depth are filled with zeros. + * + * @param out Pre-allocated output memory pointer + * @param inp_flat Flat input data pointer + * @param height Height of the flat input, traversed in steps of 8 + * @param width Width of the flat input, traversed in steps of 4 + * @param depth Number of channels of the flat input, traversed in steps of 32 + */ +void blockize_hwc_16b(void* out, void* inp_flat, int height, int width, int depth) { + auto inp_data = static_cast(inp_flat); + auto out_data = static_cast(out); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += 8) { + for (int cx = 0; cx < width; cx += 4) { + for (int cc = 0; cc < depth; cc += 32) { + auto block = reinterpret_cast(*out_data++); + int max_y = std::min(8, height - cy); + int max_x = std::min(4, width - cx); + int max_c = std::min(32, depth - cc); + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + block[xyc_to_sm_16b(y, x, c)] = + inp_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)]; + } + for (int c = max_c; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } + for (int x = max_x; x < 4; ++x) { + for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } + } + + for (int y = max_y; y < 8; ++y) + for (int x = 0; x < 4; ++x) + for (int c = 0; c < 32; ++c) block[xyc_to_sm_16b(y, x, c)] = 0; + } // cc + } // cx + } // cy +} + +/** + * @brief Convert back from non-contiguous layout to a flat layout + * + * @param 
out_flat Pre-allocated output memory pointer + * @param inp Blockized input data pointer + * @param height Height of the flat output, traversed in steps of 8 + * @param width Width of the flat output, traversed in steps of 4 + * @param depth Number of channels of the flat output, traversed in steps of 32 + */ +void deblockize_hwc_16b(void* out_flat, void* inp, int height, int width, int depth) { + uintptr_t* inp_data = static_cast<uintptr_t*>(inp); + uint16_t* out_data = static_cast<uint16_t*>(out_flat); + const int stride_x = depth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += 8) { + for (int cx = 0; cx < width; cx += 4) { + for (int cc = 0; cc < depth; cc += 32) { + /* Each entry of inp_data holds the address of one block */ + auto block = reinterpret_cast<uint16_t*>(*inp_data); + int max_y = std::min(8, height - cy); + int max_x = std::min(4, width - cx); + int max_c = std::min(32, depth - cc); + /* Only the in-range part of the block is copied back; padding is dropped */ + for (int y = 0; y < max_y; ++y) { + for (int x = 0; x < max_x; ++x) { + for (int c = 0; c < max_c; ++c) { + out_data[(cy + y) * stride_y + (cx + x) * stride_x + (cc + c)] = + block[xyc_to_sm_16b(y, x, c)]; + } + } + } + + inp_data++; + } + } + } +} + +/** + * @brief Convert the layout of weights from flat to "chunked". The term chunked is explained below: + * + * Weights are packed into the below mentioned layout (notation similar to index map): + * Since weights cannot be exactly represented into an index map notation, the + * base split up is mentioned below with a few gotchas + * + * lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2 + * + * The gotchas are: + * - (w%4) is actually stored in the right to left order, as in 3,2,1,0 instead of 0,1,2,3 + * - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different sizes + * (thereby the name "chunked" instead of packed) + * - The thinnest chunk of width is stored first. 
For example, if a kernel is 5x5, the first + * chunk along the width has size 1 (representing index 0) and then next one has size 4 + * representing indices (1,2,3,4) + * + * @param out_ptr Base pointer table to be filled with the list of pointers to the first addresses + * of the "chunked" weights + * @param out_ptr_size The number of chunks + * @param out Pointer to pre-allocated output memory + * @param inp Pointer to flat input data + * @param height Height of the flat weights, traversed in steps of 8 + * @param width Width of the flat weights, split into chunks of at most 4 + * @param idepth Number of input channels, padded with zeros up to a multiple of 32 + * @param odepth Number of output channels, padded with zeros up to a multiple of 32 + */ +void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, + int width, int idepth, int odepth) { + auto inp_data = static_cast(inp); + auto out_data = static_cast(out); + const int stride_i = odepth; + const int stride_x = stride_i * idepth; + const int stride_y = stride_x * width; + + for (int cy = 0; cy < height; cy += 8) { + // In the chunkified tensor, the chunks are ordered in increasing + // x order, but they start from the thin one. + for (int cx = width - round_up(width, 4); cx < width; cx += 4) { + int cx0 = std::max(0, cx); + for (int ci = 0; ci < idepth; ci += 32) { + for (int co = 0; co < odepth; co += 32) { + int max_y = std::min(8, height - cy); + int max_x = std::min(4, cx + 4 - cx0); + int max_i = std::min(32, idepth - ci); + int max_o = std::min(32, odepth - co); + + auto chunk = reinterpret_cast(out_data); + for (int y = 0; y < max_y; ++y) { + for (int x = max_x - 1; x >= 0; --x) { + for (int i = 0; i < max_i; ++i) { + for (int o = 0; o < max_o; ++o) { + chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = + inp_data[(cy + y) * stride_y + (cx0 + x) * stride_x + (ci + i) * stride_i + + (co + o)]; + } + for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0; + } + for (int i = max_i; i < 32; ++i) + for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0; + } + } + + *out_ptr++ = chunk; + out_data += max_y * max_x * 32 * 32; + out_ptr_size--; + assert(out_ptr_size >= 0); + } + } 
+ } + } +} + +/** + * @brief Allocate "global.vtcm" storage for an NHWC tensor whose height, width and channels are + * rounded up to multiples of 8, 4 and 32, and optionally blockize the flat data into it + * + * @param device_api Device API used for the allocation + * @param nhwc_flat Flat NHWC tensor + * @param copy_data When true the flat data is blockized into the allocation, otherwise the + * allocation is returned without being written to + * + * @return Tensor with shape {n, h / 8, w / 4, c / 32} computed from the rounded-up sizes + */ +SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, + bool copy_data) { + tvm::runtime::String vtcm_scope = "global.vtcm"; + + // Allocate blocks for activations. We will use the block pointers + // directly from the allocated area. + int n = nhwc_flat->shape[0]; + int h = round_up(nhwc_flat->shape[1], 8); + int w = round_up(nhwc_flat->shape[2], 4); + int c = round_up(nhwc_flat->shape[3], 32); + int64_t shape_2d[2] = {(n * h * w * c) / (8 * 4 * 32), 8 * 4 * 32}; + void* nhwc_vtcm = + device_api->AllocDataSpace(hexagon_device, 2, shape_2d, nhwc_flat->dtype, vtcm_scope); + if (copy_data) { + blockize_hwc_16b(nhwc_vtcm, nhwc_flat->data, nhwc_flat->shape[1], nhwc_flat->shape[2], + nhwc_flat->shape[3]); + } + + return SDLTensor<4>(nhwc_vtcm, nhwc_flat->dtype, nhwc_vtcm, {n, h / 8, w / 4, c / 32}); +} + +/** + * @brief Allocate "global.vtcm" storage for HWIO weights and chunkify the flat weights into it + * + * @param device_api Device API used for the allocation + * @param hwio_flat Flat HWIO weights tensor + * @param num_chunks Number of entries available in ptr_table + * @param ptr_table Caller-provided table that is filled with the chunk pointers + * + * @return Tensor with shape {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32}, where i and + * o are the channel counts rounded up to multiples of 32 + */ +SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, + int num_chunks, void** ptr_table) { + tvm::runtime::String vtcm_scope = "global.vtcm"; + + // Allocate one block for filter data. We will need to create our own + // pointer table. The reason is that filter chunks cannot be padded + // height- or width-wise, so filter chunks may have different sizes. + // A filter chunk is a block of size HxWx32x32, where H, W are at most + // height and width of a block respectively. 
+ int h = hwio_flat->shape[0]; + int w = hwio_flat->shape[1]; + int i = round_up(hwio_flat->shape[2], 32); + int o = round_up(hwio_flat->shape[3], 32); + int64_t shape_1d[] = {h * w * i * o}; + void* hwio_vtcm = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope); + + chunkify_hwio_16b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0], + hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3]); + + return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, + {round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32}); +} + +/** + * @brief Number of weight chunks for an HWIO shape: each dimension is rounded up to a multiple + * of 8, 4, 32 and 32 respectively and the product is divided by 8 * 4 * 32 * 32 + * + * @param shape_hwio Flat weights shape in HWIO order + */ +int calculate_num_weight_chunks(int64_t* shape_hwio) { + int h = round_up(shape_hwio[0], 8); + int w = round_up(shape_hwio[1], 4); + int i = round_up(shape_hwio[2], 32); + int o = round_up(shape_hwio[3], 32); + + return (h * w * i * o) / (8 * 4 * 32 * 32); +} + +} // namespace hexagon +} // namespace runtime +} // namespace tvm diff --git a/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc new file mode 100644 index 000000000000..3b922fa6c2a8 --- /dev/null +++ b/tests/cpp-runtime/hexagon/hexagon_fp16_utils_tests.cc @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "tvm/runtime/hexagon/ops/conv2d.h" + +using namespace tvm::runtime::hexagon; + +/* Base fixture: owns a flat 4-d float16 tensor allocated in the "global.vtcm" scope */ +class HexagonUtilsTest : public ::testing::Test { + public: + void SetUp() override { + vtcm_scope = "global.vtcm"; + device_api = tvm::runtime::DeviceAPI::Get(hexagon_device, false); + float16.code = kDLFloat; + float16.bits = 16; + float16.lanes = 1; + } + + /* Allocates s1 * s2 * s3 * s4 elements, fills them with random values and describes them + * through flat_tensor. Must be paired with TearDownTensor() */ + void setupTensor(std::tuple shape) { + auto [s1, s2, s3, s4] = shape; + tensor_shape[0] = s1; + tensor_shape[1] = s2; + tensor_shape[2] = s3; + tensor_shape[3] = s4; + int64_t shape_1d[1] = {s1 * s2 * s3 * s4}; + + flat_mem = device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); + flat_mem_data = static_cast(flat_mem); + fill_vals(flat_mem_data, shape_1d[0]); + + flat_tensor.data = flat_mem; + flat_tensor.device = hexagon_device; + flat_tensor.ndim = 4; + flat_tensor.dtype = float16; + flat_tensor.shape = tensor_shape; + flat_tensor.strides = nullptr; + flat_tensor.byte_offset = 0; + } + + /* Frees the memory allocated by setupTensor() */ + void TearDownTensor() { + if (flat_tensor.data) device_api->FreeDataSpace(hexagon_device, flat_mem); + } + + static void fill_vals(uint16_t* arr, int size) { + // Testing with uint16 instead of float16 as generating random float16 is not easy within c++ + uint16_t max = UINT16_MAX; + srand(std::time(0)); + for (int i = 0; i < size; ++i) { + arr[i] = static_cast(std::rand() % max); + } + } + + /* Row-major offset of (nn, hh, ww, cc) in a 4-d tensor; shape[0] is not needed */ + static int flattened_idx(int nn, int hh, int ww, int cc, int64_t* shape) { + int h = shape[1]; + int w = shape[2]; + int c = shape[3]; + return cc + c * (ww + w * (hh + h * (nn))); + } + + DLTensor flat_tensor; + void* flat_mem; + uint16_t* flat_mem_data; + tvm::runtime::DeviceAPI* device_api; + tvm::runtime::String vtcm_scope; + DLDataType float16; + int64_t tensor_shape[4]; +}; + +// Parameterized test fixture with 4 params representing 
n, h, w, c +class HexagonUtilsActivationsBlockizeTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +// TODO (quic-sanirudh): See if we can test with random generated indices +INSTANTIATE_TEST_SUITE_P( + BlockizeDeblockizeTestFixtures, HexagonUtilsActivationsBlockizeTest, + ::testing::Combine(::testing::Values(std::make_tuple(1, 14, 7, 60)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(0, 7, 3, 31), // last element of the first block + // Remaining are random element tests + std::make_tuple(0, 13, 6, 59), + std::make_tuple(0, 0, 0, 32), std::make_tuple(0, 0, 4, 32), + std::make_tuple(0, 2, 3, 4), std::make_tuple(0, 5, 6, 7), + std::make_tuple(0, 10, 4, 12))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<1>(indices); + int w = std::get<2>(indices); + int c = std::get<3>(indices); + // Generate test name as "hwc0x0x0" if the indices of hwc are 0,0,0 + std::string name = + "hwc" + std::to_string(h) + "x" + std::to_string(w) + "x" + std::to_string(c); + return name; + }); + +TEST_F(HexagonUtilsActivationsBlockizeTest, prepare_nhwc) { + auto shape = std::make_tuple(1, 14, 7, 60); + auto [n, h, w, c] = shape; + setupTensor(shape); + + // copy_data is set to false here as there's a separate test for blockize when copy_data + // becomes true + auto blocked_tensor = prepare_nhwc(device_api, &flat_tensor, /*copy_data=*/false); + + EXPECT_EQ(blocked_tensor.shape[0], n); + EXPECT_EQ(blocked_tensor.shape[1], round_up(h, 8) / 8); + EXPECT_EQ(blocked_tensor.shape[2], round_up(w, 4) / 4); + EXPECT_EQ(blocked_tensor.shape[3], round_up(c, 32) / 32); + + TearDownTensor(); + release(device_api, blocked_tensor); +} + +TEST_P(HexagonUtilsActivationsBlockizeTest, blockize_hwc_16b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple); + auto [n, h, w, c] = shape_tuple; + int64_t shape[] 
= {n, h, w, c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 4); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); + int64_t blocked_shape[] = {n, h_rounded / 8, w_rounded / 4, c_rounded / 32}; + blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + + std::function flatten = + HexagonUtilsActivationsBlockizeTest::flattened_idx; + + auto getBlockedElem = [&blocked_shape, blocked_mem, flatten](int nn, int hh, int ww, int cc) { + auto* blocks = static_cast(blocked_mem); + int blockIdx = flatten(nn, hh / 8, ww / 4, cc / 32, blocked_shape); + uint16_t* block = reinterpret_cast(blocks[blockIdx]); + return block[xyc_to_sm_16b(hh % 8, ww % 4, cc % 32)]; + }; + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + EXPECT_EQ(flat_mem_data[flattened_idx(nn, hh, ww, cc, shape)], getBlockedElem(nn, hh, ww, cc)); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); +} + +TEST_P(HexagonUtilsActivationsBlockizeTest, deblockize_hwc_16b) { + auto shape_tuple = std::get<0>(GetParam()); + setupTensor(shape_tuple); + auto [n, h, w, c] = shape_tuple; + int64_t shape[] = {n, h, w, c}; + int64_t shape_1d[1] = {n * h * w * c}; + + int h_rounded = round_up(h, 8); + int w_rounded = round_up(w, 4); + int c_rounded = round_up(c, 32); + int64_t shape_2d[2] = {(n * h_rounded * w_rounded * c_rounded) / (8 * 4 * 32), 8 * 4 * 32}; + + void* blocked_mem = device_api->AllocDataSpace(hexagon_device, 2, shape_2d, float16, vtcm_scope); + blockize_hwc_16b(blocked_mem, flat_mem, h, w, c); + + void* deblocked_flat_mem = + device_api->AllocDataSpace(hexagon_device, 1, shape_1d, float16, vtcm_scope); + deblockize_hwc_16b(deblocked_flat_mem, blocked_mem, h, w, c); + auto* deblocked_flat_mem_data = static_cast(deblocked_flat_mem); + + auto [nn, hh, ww, cc] = std::get<1>(GetParam()); + + 
auto idx = flattened_idx(nn, hh, ww, cc, shape); + EXPECT_EQ(flat_mem_data[idx], deblocked_flat_mem_data[idx]); + + TearDownTensor(); + device_api->FreeDataSpace(hexagon_device, blocked_mem); + device_api->FreeDataSpace(hexagon_device, deblocked_flat_mem); +} + +class HexagonUtilsWeightsChunkifyTest + : public HexagonUtilsTest, + public ::testing::WithParamInterface, std::tuple>> {}; + +INSTANTIATE_TEST_SUITE_P( + ChunkifyDechunkifyTests, HexagonUtilsWeightsChunkifyTest, + ::testing::Combine(::testing::Values(std::make_tuple(3, 3, 40, 40)), + ::testing::Values(std::make_tuple(0, 0, 0, 0), // first element + std::make_tuple(2, 2, 39, 39), // Last element + // Remaining are random element tests + std::make_tuple(1, 1, 28, 33), + std::make_tuple(1, 2, 8, 38), + std::make_tuple(1, 0, 12, 15), + std::make_tuple(2, 1, 9, 22), std::make_tuple(0, 2, 6, 7), + std::make_tuple(1, 2, 3, 4))), + [](const ::testing::TestParamInfo& info) { + // Can use info.param here to generate the test suffix + auto indices = std::get<1>(info.param); + int h = std::get<0>(indices); + int w = std::get<1>(indices); + int i = std::get<2>(indices); + int o = std::get<3>(indices); + // Generate test name as "hwio00x0x0" if the indices of hwio are 0,0,0,0 (h and w are + // concatenated without a separator) + std::string name = "hwio" + std::to_string(h) + std::to_string(w) + "x" + std::to_string(i) + + "x" + std::to_string(o); + return name; + }); + +TEST_F(HexagonUtilsWeightsChunkifyTest, calculate_num_weight_chunks) { + int64_t shape[] = {3, 3, 40, 40}; + int num_wgt_chunks = calculate_num_weight_chunks(shape); + EXPECT_EQ(num_wgt_chunks, 4); +} + +TEST_F(HexagonUtilsWeightsChunkifyTest, prepare_hwio) { + int64_t shape[] = {3, 3, 40, 40}; + auto [h, w, i, o] = shape; + auto shape_tuple = std::make_tuple(h, w, i, o); + setupTensor(shape_tuple); + + // Only the shape of the chunked tensor is checked here; the chunked contents are covered by + // the chunkify_hwio_16b test + auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto wgt_ptr_table = + 
reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + EXPECT_EQ(chunked_tensor.shape[0], round_up(h, 8) / 8); + EXPECT_EQ(chunked_tensor.shape[1], round_up(w, 4) / 4); + EXPECT_EQ(chunked_tensor.shape[2], round_up(i, 32) / 32); + EXPECT_EQ(chunked_tensor.shape[3], round_up(o, 32) / 32); + + release(device_api, chunked_tensor); + TearDownTensor(); +} + +TEST_P(HexagonUtilsWeightsChunkifyTest, chunkify_hwio_16b) { + auto [shape_tuple, indices] = GetParam(); + auto [h, w, i, o] = shape_tuple; + setupTensor(shape_tuple); + int64_t shape[] = {h, w, i, o}; + + auto num_wgt_chunks = calculate_num_weight_chunks(shape); + auto wgt_ptr_table = + reinterpret_cast(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); + auto chunked_tensor = prepare_hwio(device_api, &flat_tensor, num_wgt_chunks, wgt_ptr_table); + + int rd = w - (w % 4); // round down by 4 for width + int thin_w = w - rd; + + auto getChunkedElem = [thin_w, chunked_tensor](int hh, int ww, int ii, int oo) { + int fcw = 0; + if (ww >= thin_w) { + fcw = (ww - thin_w) / 4 + 1; + ww = (ww - thin_w) % 4; + } + auto chunk = hwio_at(chunked_tensor, hh / 8, fcw, ii / 32, oo / 32); + auto chunk_uint16 = reinterpret_cast(chunk); + return chunk_uint16[hwio_to_sm_16b(thin_w, hh % 8, ww, ii % 32, oo % 32)]; + }; + + auto [hh, ww, ii, oo] = indices; + + EXPECT_EQ(flat_mem_data[flattened_idx(hh, ww, ii, oo, shape)], getChunkedElem(hh, ww, ii, oo)); + release(device_api, chunked_tensor); + // Free the flat tensor allocated by setupTensor(), as the other tests in this file do + TearDownTensor(); +} diff --git a/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py new file mode 100644 index 000000000000..e8efdb369590 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_conv2d_fp16_intrin.py @@ -0,0 +1,248 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test conv2d HVX intrinsic implementation""" + +import numpy as np + +import tvm +import tvm.contrib.hexagon +from tvm.topi.testing import conv2d_nhwc_python + + +def build_conv2d(target): + """Build and the return the conv2d module that calls the intrinsic implementation""" + act_n, act_h, act_w, act_c = ( + tvm.te.var("act_n"), + tvm.te.var("act_h"), + tvm.te.var("act_w"), + tvm.te.var("act_c"), + ) + filt_h, filt_w, filt_o = tvm.te.var("filt_h"), tvm.te.var("fw"), tvm.te.var("filt_o") + off_l, off_t = tvm.te.var("off_l"), tvm.te.var("off_t") + stride_h, stride_w = tvm.te.var("stride_h"), tvm.te.var("stride_w") + + act_flat = tvm.te.placeholder( + shape=(act_n, act_h, act_w, act_c), dtype="float16", name="act_flat" + ) + wgt_flat = tvm.te.placeholder( + shape=(filt_h, filt_w, act_c, filt_o), dtype="float16", name="wgt_flat" + ) + + out_flat = tvm.te.extern( + shape=(act_n, (act_h - filt_h) // stride_h + 1, (act_w - filt_w) // stride_w + 1, filt_o), + inputs=[act_flat, wgt_flat], + fcompute=lambda ins, outs: tvm.tir.call_cpacked( + "conv2d_packed_fp16", # Function from TVM runtime + ins[0], + ins[1], + off_t, + off_l, + stride_h, + stride_w, + outs[0], + tvm.runtime.const(0), # resource_handle (unused) + ), + dtype="float16", + ) + + s = tvm.te.create_schedule(out_flat.op) + + 
func_name = "extern_conv" + with tvm.transform.PassContext(opt_level=3): + module = tvm.build( + s, + [act_flat, wgt_flat, off_t, off_l, stride_h, stride_w, out_flat], + target=target, + name=func_name, + ) + + return module + + +shape_parameters = [ + ( + (1, 8, 4, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 10, 14, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 14, 6, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 14, 6, 3), + (3, 3, 3, 64), + (1, 1), + ), + ( + (1, 14, 6, 3), + (5, 5, 3, 3), + (1, 1), + ), + ( + (1, 8, 8, 3), + (2, 2, 3, 3), + (1, 1), + ), + ( + (1, 14, 6, 64), + (3, 3, 64, 3), + (1, 1), + ), + ( + (1, 4, 4, 40), + (3, 3, 40, 3), + (1, 1), + ), + ( + (1, 4, 4, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 5, 5, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 6, 6, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 7, 7, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 8, 8, 3), + (3, 3, 3, 3), + (1, 1), + ), + ( + (1, 8, 8, 3), + (5, 5, 3, 3), + (1, 1), + ), + ( + (1, 8, 8, 64), + (2, 2, 64, 64), + (1, 1), + ), + ( + (1, 8, 4, 3), + (3, 3, 3, 3), + (2, 2), + ), + ( + (1, 14, 6, 3), + (3, 3, 3, 64), + (2, 2), + ), + ( + (1, 14, 6, 3), + (5, 5, 3, 3), + (2, 2), + ), + ( + (1, 8, 8, 3), + (2, 2, 3, 3), + (2, 2), + ), +] + + +def gen_config(params): + """Utility function to generate useful ids for shape_parameters""" + + dims = lambda vals: "x".join(map(str, vals)) + + config = {} + for param in params: + act_shape, wgt_shape, inp_stride = param + name = f"nhwc{dims(act_shape)}-hwio{dims(wgt_shape)}-stride{dims(inp_stride)}" + config[name] = param + + return config + + +class TestConv2dIntrin: + """Test Conv2d Intrin class""" + + config = gen_config(shape_parameters) + act_shape, wgt_shape, inp_stride = tvm.testing.parameters(*config.values(), ids=config.keys()) + inp_offset = tvm.testing.parameter((0, 0), ids=["offset0x0"]) + + @tvm.testing.requires_hexagon + def test_conv2d(self, act_shape, wgt_shape, inp_stride, inp_offset, hexagon_session): + """Test conv2d 
intrinsic implementation""" + assert act_shape[3] == wgt_shape[2] + + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + # Currently, input offset does not affect the output shape + def get_out_shape(ash, wsh, inp_stride): + assert ash[3] == wsh[2] + osh = ( + ash[0], + (ash[1] - wsh[0]) // inp_stride[0] + 1, + (ash[2] - wsh[1]) // inp_stride[1] + 1, + wsh[3], + ) + assert tvm.tir.all([x > 0 for x in osh]) + return osh + + act = np.random.rand(*act_shape).astype("float16") + wgt = np.random.rand(*wgt_shape).astype("float16") + + module = build_conv2d(target) + + mod = hexagon_session.load_module(module) + output = tvm.nd.array( + np.zeros(get_out_shape(act_shape, wgt_shape, inp_stride), dtype="float16"), + device=hexagon_session.device, + ) + mod( + tvm.nd.array(act, device=hexagon_session.device), + tvm.nd.array(wgt, device=hexagon_session.device), + inp_offset[0], # off_t + inp_offset[1], # off_l + inp_stride[0], # stride_height + inp_stride[1], # stride_width + output, + ) + + out = output.numpy() + + # Generate reference output and compare: + ref_out = conv2d_nhwc_python( + act.astype("float32"), wgt.astype("float32"), stride=inp_stride, padding="VALID" + ).astype("float16") + + tvm.testing.assert_allclose(out, ref_out, rtol=5e-2, atol=5e-2) + + +if __name__ == "__main__": + tvm.testing.main() From 12223983422868bbbc5444f66d175aeb9318b71f Mon Sep 17 00:00:00 2001 From: Dhruv Chauhan <89972057+dchauhan-arm@users.noreply.github.com> Date: Mon, 12 Sep 2022 21:03:56 +0100 Subject: [PATCH 152/704] [TFLite] Support quantized GREATER op in TFLite frontend (#12754) Support GREATER quantization operation conversion as part of issue #9187 Continuation of #11519. 
--- python/tvm/relay/frontend/tflite.py | 19 ++++---- tests/python/frontend/tflite/test_forward.py | 49 +++++++++++--------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index c38191b389c9..6c68230e0ecc 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1291,7 +1291,13 @@ def convert_square(self, op): return out - def _convert_elemwise(self, relay_op, op, ignore_qnn_params=False): + def _convert_elemwise( + self, + relay_op, + op, + ignore_qnn_params=False, + comparison_op=False, + ): """Generic method to Convert TFLite elemwise""" try: from tflite.AddOptions import AddOptions @@ -1316,7 +1322,7 @@ def _convert_elemwise(self, relay_op, op, ignore_qnn_params=False): # TFLite format demands equal scale and zero_point tuple parameters for some operations # to allow us to use non-quantized operation instead of quantized if ignore_qnn_params=True - if ignore_qnn_params: + if ignore_qnn_params and not comparison_op: assert ( lhs_tensor.qnn_params and self.has_same_qnn_params(lhs_tensor, output_tensor) @@ -1431,12 +1437,7 @@ def convert_minimum(self, op): def convert_greater(self, op): """Convert TFLite GREATER""" - # Check if the input tensor is quantized, call QNN op - if self.is_quantized(op): - raise tvm.error.OpNotImplemented( - "TFlite quantized GREATER operator is not supported yet." 
- ) - return self._convert_elemwise(_op.greater, op) + return self._convert_elemwise(_op.greater, op, self.is_quantized(op), comparison_op=True) def convert_squared_difference(self, op): """Convert TFLite SQUARED DIFFERENCE""" @@ -1475,7 +1476,7 @@ def convert_less_equal(self, op): def convert_equal(self, op): """Convert TFLite EQUAL""" - return self._convert_elemwise(_op.equal, op, self.is_quantized(op)) + return self._convert_elemwise(_op.equal, op, self.is_quantized(op), comparison_op=True) def convert_not_equal(self, op): """Convert TFLite NOT_EQUAL""" diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 7267b725483d..18045b8e8365 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -2254,6 +2254,7 @@ def _test_elemwise( quantized=False, qnn_op=None, same_qnn_params=False, + comparison_op=False, ): """One iteration of elemwise""" @@ -2298,7 +2299,7 @@ def __test_elemwise(in_data): if x[0] is not None } - if math_op is math_ops.equal: + if comparison_op: out = math_op(inq_data[0], inq_data[1]) out = with_fused_activation_function(out, fused_activation_function) @@ -2307,6 +2308,9 @@ def __test_elemwise(in_data): [x + ":0" for x in input_range.keys()], [x[1] for x in zip(in_data, inq_data) if x[0] is not None], [out], + quantized=True, + input_range=input_range, + experimental_new_converter=same_qnn_params, ) else: out = math_op(inq_data[0], inq_data[1]) @@ -2314,6 +2318,7 @@ def __test_elemwise(in_data): out = tf.quantization.fake_quant_with_min_max_args( out, min=out_min, max=out_max, name="out" ) + # Note same_qnn_params uses experimental_new_converter as toco failed compare_tflite_with_tvm( [x[1] for x in zip(in_data, data) if x[0] is not None], @@ -2440,9 +2445,17 @@ def _test_minimum(data, fused_activation_function=None, quantized=False, qnn_op= # ------- -def _test_greater(data): +def _test_greater(data, fused_activation_function=None, 
quantized=False, qnn_op=None): """One iteration of greater""" - return _test_elemwise(math_ops.greater, data) + return _test_elemwise( + math_ops.greater, + data, + fused_activation_function, + quantized, + qnn_op, + same_qnn_params=True, + comparison_op=True, + ) ####################################################################### @@ -2489,6 +2502,7 @@ def _test_equal(data, fused_activation_function=None, quantized=False, qnn_op=No quantized, qnn_op, same_qnn_params=True, + comparison_op=True, ) @@ -2555,25 +2569,14 @@ def _test_forward_elemwise(testop): def _test_forward_elemwise_quantized(testop): - if testop is not _test_equal: - testop( - [ - np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8), - np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8), - ], - quantized=True, - qnn_op=testop, - ) - else: - # no need for fake_quant to hold tensors in float32 until conversion - testop( - [ - np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.float32), - np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.float32), - ], - quantized=True, - qnn_op=testop, - ) + testop( + [ + np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8), + np.array(np.random.uniform(0, 255, (3, 6)), dtype=np.uint8), + ], + quantized=True, + qnn_op=testop, + ) def _test_elemwise_qnn_out_range(qnn_op): @@ -2585,6 +2588,7 @@ def _test_elemwise_qnn_out_range(qnn_op): _test_maximum: (-112, 111), _test_minimum: (-128, 127), _test_equal: (-150, 150), + _test_greater: (-150, 150), } return qnn_out_range[qnn_op] @@ -2615,6 +2619,7 @@ def test_all_elemwise(): _test_forward_elemwise(_test_minimum) _test_forward_elemwise_quantized(_test_minimum) _test_forward_elemwise(_test_greater) + _test_forward_elemwise_quantized(_test_greater) _test_forward_elemwise(_test_squared_difference) _test_forward_elemwise(_test_greater_equal) _test_forward_elemwise(_test_less) From 9671aee942503815ad2a586406eef11391287ee5 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 12 Sep 2022 
15:31:52 -0500 Subject: [PATCH 153/704] [Hexagon] Validate 2-d physical shapes for TIR-derived schedules (#12662) Previously, the test cases only tested TE-based schedules. This commit runs the same tests for equivalent TIR-based schedules as well. This is intended to catch Hexagon-specific regressions, such as the one resolved in https://github.com/apache/tvm/pull/12652. --- .../test_hexagon/test_2d_physical_buffers.py | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) mode change 100644 => 100755 tests/python/contrib/test_hexagon/test_2d_physical_buffers.py diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py old mode 100644 new mode 100755 index cebb36edc35d..cba6ddc4433a --- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py +++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py @@ -41,6 +41,8 @@ # there as well # pylint: disable=invalid-name +schedule_type = tvm.testing.parameter("TE", "TIR") + dtype = tvm.testing.parameter("int8") batch_size = tvm.testing.parameter( 16, @@ -198,6 +200,7 @@ def output_shape(self, input_shape): @tvm.testing.fixture def schedule_args( self, + schedule_type, input_shape, dtype, input_layout, @@ -206,12 +209,39 @@ def schedule_args( working_scope, ): """Create and return the schedule and input args after applying layout transform""" + if schedule_type == "TE": + + return self._te_schedule_args( + input_shape, dtype, input_layout, output_layout, working_layout, working_scope + ) + elif schedule_type == "TIR": + return self._tir_schedule_args( + input_shape, dtype, input_layout, output_layout, working_layout, working_scope + ) + + else: + raise ValueError(f"Unknown schedule type: {schedule_type}") + + def _te_tensors(self, input_shape, dtype): input_tensor = te.placeholder(input_shape, dtype, name="Input") output_tensor = te.compute( shape=input_tensor.shape, fcompute=lambda *indices: (2 * 
input_tensor[indices]).astype(dtype), name="Output", ) + return input_tensor, output_tensor + + def _te_schedule_args( + self, + input_shape, + dtype, + input_layout, + output_layout, + working_layout, + working_scope, + ): + input_tensor, output_tensor = self._te_tensors(input_shape, dtype) + schedule = te.create_schedule(output_tensor.op) write_cache = schedule.cache_write(output_tensor, working_scope) @@ -235,6 +265,33 @@ def apply_transform(tensor, layout): return [schedule, [input_tensor, output_tensor]] + def _tir_schedule_args( + self, input_shape, dtype, input_layout, output_layout, working_layout, working_scope + ): + tensors = self._te_tensors(input_shape, dtype) + + sch = tvm.tir.Schedule(te.create_prim_func(tensors)) + + cache_read_block = sch.cache_read("Output", 0, working_scope) + cache_write_block = sch.cache_write("Output", 0, working_scope) + + def apply_transform(block, buffer_name, layout): + if layout == "nhwc": + pass + elif layout == "nchw-8h8w32c-1d": + sch.transform_layout(block, buffer_name, layout_transform_1d) + elif layout == "nchw-8h8w32c-2d": + sch.transform_layout(block, buffer_name, layout_transform_2d) + else: + raise RuntimeError(f"Unexpected layout '{layout}'") + + apply_transform(cache_read_block, ("read", 0), input_layout) + apply_transform(cache_read_block, ("write", 0), working_layout) + apply_transform(cache_write_block, ("read", 0), working_layout) + apply_transform(cache_write_block, ("write", 0), output_layout) + + return [sch.mod] + @tvm.testing.fixture def ir_module(self, schedule_args): # If the two buffers are accessed with the same indices, CSE @@ -272,7 +329,7 @@ def test_cache_shape(self, ir_module, input_layout, working_layout, output_layou "Input.global.vtcm": working_layout, "Output.global.vtcm": working_layout, "Output": output_layout, - }[buffer.name] + }[buffer.name.replace("_", ".")] expected_physical_dimensions = { "nhwc": 1, From 4d2766409f1b95504aac171649367c2df2813029 Mon Sep 17 00:00:00 2001 From: Junru 
Shao Date: Mon, 12 Sep 2022 15:06:16 -0800 Subject: [PATCH 154/704] [AutoTVM] Fix `None` feature in AutoTVM tuning (#12760) This PR introduces a couple of fixes to make AutoTVM work more robustly: - Fixed a very rare case where `None` could pop up in AutoTVM features; - Fixed a misuse of `ARGS` in the testing script; - Fixed the filename for caching. --- python/tvm/autotvm/testing/tune_relay.py | 13 +++++++------ python/tvm/autotvm/tuner/xgboost_cost_model.py | 7 +++---- python/tvm/meta_schedule/testing/relay_workload.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/tvm/autotvm/testing/tune_relay.py b/python/tvm/autotvm/testing/tune_relay.py index e4745963741f..743127ec1ded 100644 --- a/python/tvm/autotvm/testing/tune_relay.py +++ b/python/tvm/autotvm/testing/tune_relay.py @@ -139,12 +139,6 @@ def _parse_args(): tracker_key=parsed.rpc_key, session_timeout_sec=600, ) - if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner: - raise ValueError("GraphTuner only supports llvm target") - if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush: - raise ValueError("cpu_flush only supports llvm target") - if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush: - warnings.warn("cpu_flush is not enabled for llvm target") return parsed @@ -152,6 +146,13 @@ def _parse_args(): def main(): + if ARGS.target.kind.name != "llvm" and ARGS.graph_tuner: + raise ValueError("GraphTuner only supports llvm target") + if ARGS.target.kind.name != "llvm" and ARGS.cpu_flush: + raise ValueError("cpu_flush only supports llvm target") + if ARGS.target.kind.name == "llvm" and not ARGS.cpu_flush: + warnings.warn("cpu_flush is not enabled for llvm target") + log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json") graph_opt_sch_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}_graph_opt.log") measure_option = autotvm.measure_option( diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index
d4942ce6a4ca..6fa04f336f10 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -21,12 +21,11 @@ import time import numpy as np - from tvm.contrib.popen_pool import PopenPoolExecutor, StatusKind from .. import feature from ..utils import get_rank -from .metric import max_curve, recall_curve, cover_curve +from .metric import cover_curve, max_curve, recall_curve from .model_based_tuner import CostModel, FeatureCache xgb = None @@ -346,7 +345,7 @@ def _get_feature(self, indexes): ret = np.empty((len(indexes), feature_len), dtype=np.float32) for i, ii in enumerate(indexes): t = fea_cache[ii] - if t.shape[0] < feature_len: + if t is not None and t.shape[0] < feature_len: t = np.pad(t, (0, feature_len - t.shape[0])) ret[i, :] = t if t is not None else 0 return ret @@ -449,8 +448,8 @@ def custom_callback( ): """callback function for xgboost to support multiple custom evaluation functions""" # pylint: disable=import-outside-toplevel - from xgboost.core import EarlyStopException from xgboost.callback import _fmt_metric + from xgboost.core import EarlyStopException try: from xgboost.training import aggcv diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py index f4f6336df33f..98bb99512020 100644 --- a/python/tvm/meta_schedule/testing/relay_workload.py +++ b/python/tvm/meta_schedule/testing/relay_workload.py @@ -230,7 +230,7 @@ def get_network( inputs: Tuple[str, List[int], str] params_bytearray: bytearray - filename = f'relay-{name}-{",".join(str(i) for i in input_shape)}.json' + filename = f'relay-{name}-{layout}-{",".join(str(i) for i in input_shape)}.json' cached = _load_cache(cache_dir, filename) if cached is None: with multiprocessing.Pool(processes=1) as pool: From a23b71ce1e3011be6b8e6ca5162b023956358911 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 12 Sep 2022 15:42:40 -0800 Subject: [PATCH 155/704] [MetaSchedule][Test] Migrate 
AddRFactor to SEqual (#12758) This PR migrates the usage of `check_trace` to `check_sketch`, which prefers structural equality of TIRs instead of string equality of traces. --- .../meta_schedule/testing/schedule_rule.py | 16 +- python/tvm/tir/schedule/testing.py | 8 +- .../schedule_rule/add_rfactor.cc | 5 +- src/tir/schedule/primitive/sampling.cc | 4 +- ...meta_schedule_schedule_rule_add_rfactor.py | 142 ++++++++++++------ 5 files changed, 109 insertions(+), 66 deletions(-) diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py index 46df4b95ce07..b08db0811dd3 100644 --- a/python/tvm/meta_schedule/testing/schedule_rule.py +++ b/python/tvm/meta_schedule/testing/schedule_rule.py @@ -18,7 +18,6 @@ from typing import List, Union from tvm.meta_schedule.schedule_rule import ( - AddRFactor, AutoBind, AutoInline, CrossThreadReduction, @@ -28,7 +27,9 @@ ReuseType, ScheduleRule, ) -from tvm.meta_schedule.schedule_rule.multi_level_tiling import MultiLevelTilingTensorCore +from tvm.meta_schedule.schedule_rule.multi_level_tiling import ( + MultiLevelTilingTensorCore, +) from tvm.target import Target @@ -64,13 +65,6 @@ def auto_inline(target: Target) -> ScheduleRule: raise NotImplementedError(f"{target.kind.name} is not supported") -def add_rfactor(target: Target) -> ScheduleRule: - """Default schedule rules for with add_rfactor""" - if target.kind.name == "llvm": - return AddRFactor(max_jobs_per_core=16, max_innermost_factor=64) - raise NotImplementedError(f"{target.kind.name} is not supported") - - def cross_thread_reduction(target: Target) -> ScheduleRule: """Default schedule rules for with cross-thread reduction""" if target.kind.name == "cuda": @@ -131,7 +125,9 @@ def multi_level_tiling_tensor_core( trans_b = [trans_b] if target.kind.name == "cuda": - from tvm.tir.tensor_intrin import cuda # pylint: disable=import-outside-toplevel + from tvm.tir.tensor_intrin import ( # pylint: disable=import-outside-toplevel +
cuda, + ) intrin_groups = [ cuda.get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b) diff --git a/python/tvm/tir/schedule/testing.py b/python/tvm/tir/schedule/testing.py index 3689f756e83c..538cc6e143ee 100644 --- a/python/tvm/tir/schedule/testing.py +++ b/python/tvm/tir/schedule/testing.py @@ -15,12 +15,12 @@ # specific language governing permissions and limitations # under the License. """Testing utilities for the TensorIR schedule API""" -from typing import Union, Sequence +from typing import Sequence, Union import tvm -from tvm.ir import IRModule, structural_equal +from tvm.ir import IRModule, assert_structural_equal from tvm.tir import PrimFunc -from tvm.tir.schedule import Trace, Schedule +from tvm.tir.schedule import Schedule, Trace def verify_trace_roundtrip( @@ -70,7 +70,7 @@ def verify_trace_roundtrip( assert text_format in ("json", "python"), f"Unknown text format: {text_format}" # Step 2. Verify that the round-trip produced the same scheduling - assert structural_equal(new_sch.mod, sch.mod) + assert_structural_equal(new_sch.mod, sch.mod) # Step 3. Check the consistency of the text format between the old and new traces py_repr = "\n".join(trace.as_python()) diff --git a/src/meta_schedule/schedule_rule/add_rfactor.cc b/src/meta_schedule/schedule_rule/add_rfactor.cc index 5ef2ac3aad36..cf87f24ac233 100644 --- a/src/meta_schedule/schedule_rule/add_rfactor.cc +++ b/src/meta_schedule/schedule_rule/add_rfactor.cc @@ -90,8 +90,7 @@ Array AddRFactorNode::Apply(const tir::Schedule& sch, const tir:: // Split the fused reduction loop. 
Array factors = sch->SamplePerfectTile(fused_reduce_loop, 2, max_innermost_factor); - const Array& split_loops = - sch->Split(fused_reduce_loop, {factors.begin(), factors.end()}); + Array split_loops = sch->Split(fused_reduce_loop, {factors.begin(), factors.end()}); Array res; for (const tir::LoopRV& split_loop : split_loops) { @@ -104,7 +103,7 @@ Array AddRFactorNode::Apply(const tir::Schedule& sch, const tir:: // Annotate that the rfactor block, which is now the producer of the original block, needs to // be considered by the rule Random-Compute-Location. - sch_tmp->Annotate(block_rv, tir::attr::meta_schedule_random_compute_producer, Bool(true)); + sch_tmp->Annotate(block_rv, tir::attr::meta_schedule_random_compute_producer, Integer(1)); res.push_back(sch_tmp); } catch (const tvm::runtime::Error& e) { } diff --git a/src/tir/schedule/primitive/sampling.cc b/src/tir/schedule/primitive/sampling.cc index b1001a7f9455..ec12b045d3f0 100644 --- a/src/tir/schedule/primitive/sampling.cc +++ b/src/tir/schedule/primitive/sampling.cc @@ -338,7 +338,9 @@ std::vector SamplePerfectTile( } else { // Case 3. Use fresh new sampling result result = SamplePerfectTile(rand_state, *extent, n_splits, max_innermost_factor); - ICHECK_LE(result.back(), max_innermost_factor); + if (max_innermost_factor != -1) { + ICHECK_LE(result.back(), max_innermost_factor); + } } *decision = support::AsArray(result); return result; diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py index a39c8aea5fb6..17f42654fcf7 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py @@ -15,62 +15,108 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring - -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply +from tvm import meta_schedule as ms from tvm.meta_schedule.testing import te_workload -from tvm.meta_schedule.testing.schedule_rule import add_rfactor -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext +from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.script import tir as T from tvm.target import Target -from tvm.te.operation import create_prim_func +from tvm.te import create_prim_func -def _create_context(mod, target, rule) -> TuneContext: - ctx = TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=[rule], - task_name="test", - ) - return ctx +def test_cpu_matmul(): + @T.prim_func + def cpu_matmul_0( + A: T.Buffer[(4, 512), "float32"], + B: T.Buffer[(512, 4), "float32"], + C: T.Buffer[(4, 4), "float32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2 in T.grid(4, 4, 512): + with T.block("C"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(A[i, k], B[k, j]) + T.writes(C[i, j]) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + @T.prim_func + def cpu_matmul_1( + A: T.Buffer[(4, 512), "float32"], + B: T.Buffer[(512, 4), "float32"], + C: T.Buffer[(4, 4), "float32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + C_rf = T.alloc_buffer([4, 4, 128], dtype="float32") + for i0, i1, i2_0, i2_1 in T.grid(4, 4, 4, 128): + with T.block("C_rf"): + vi2_1, i, j, vi2_0 = T.axis.remap("SSSR", [i2_1, i0, i1, i2_0]) + T.reads(A[i, vi2_0 * 128 + vi2_1], B[vi2_0 * 128 + vi2_1, j]) + T.writes(C_rf[i, j, vi2_1]) + with T.init(): + C_rf[i, j, vi2_1] = T.float32(0) + C_rf[i, j, vi2_1] = ( + C_rf[i, j, vi2_1] + A[i, vi2_0 * 128 + vi2_1] * B[vi2_0 * 128 + vi2_1, j] 
+ ) + for i0, i1, i2_1 in T.grid(4, 4, 128): + with T.block("C"): + vi2_1, i, j = T.axis.remap("RSS", [i2_1, i0, i1]) + T.reads(C_rf[i, j, vi2_1]) + T.writes(C[i, j]) + T.block_attr({"meta_schedule.random_compute_producer": 1}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + C_rf[i, j, vi2_1] -def test_cpu_matmul(): - expected = [ - [], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)", - "b8 = sch.rfactor(loop=l7, factor_axis=2)", - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)', - ], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l6, l7 = sch.split(loop=l3, factors=[v4, v5], preserve_unit_iters=True)", - "b8 = sch.rfactor(loop=l6, factor_axis=2)", - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.random_compute_producer", ann_val=1)', - ], + @T.prim_func + def cpu_matmul_2( + A: T.Buffer[(4, 512), "float32"], + B: T.Buffer[(512, 4), "float32"], + C: T.Buffer[(4, 4), "float32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + C_rf = T.alloc_buffer([4, 4, 4], dtype="float32") + for i0, i1, i2_0, i2_1 in T.grid(4, 4, 4, 128): + with T.block("C_rf"): + vi2_0, i, j, vi2_1 = T.axis.remap("SSSR", [i2_0, i0, i1, i2_1]) + T.reads(A[i, vi2_0 * 128 + vi2_1], B[vi2_0 * 128 + vi2_1, j]) + T.writes(C_rf[i, j, vi2_0]) + with T.init(): + C_rf[i, j, vi2_0] = T.float32(0) + C_rf[i, j, vi2_0] = ( + C_rf[i, j, vi2_0] + A[i, vi2_0 * 128 + vi2_1] * B[vi2_0 * 128 + vi2_1, j] + ) + for i0, i1, i2_0 in T.grid(4, 4, 4): + with T.block("C"): + vi2_0, i, j = T.axis.remap("RSS", [i2_0, i0, i1]) + T.reads(C_rf[i, j, vi2_0]) + T.writes(C[i, j]) + 
T.block_attr({"meta_schedule.random_compute_producer": 1}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + C_rf[i, j, vi2_0] + + decision_0 = [] # type: ignore + decision_1 = [ + ("SamplePerfectTile", [4, 128]), + ] + decision_2 = [ + ("SamplePerfectTile", [4, 128]), ] - target = Target("llvm --num-cores=32") - ctx = _create_context( - create_prim_func( - te_workload.matmul( - n=4, - m=4, - k=512, - ) - ), - target=target, - rule=add_rfactor(target=target), + mod = create_prim_func(te_workload.matmul(n=4, m=4, k=512)) + actual = ms.TuneContext( + mod=mod, + target=Target("llvm --num-cores=32"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ms.schedule_rule.AddRFactor()], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cpu_matmul_0, cpu_matmul_1, cpu_matmul_2], + expected_decisions=[decision_0, decision_1, decision_2], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 3 - check_trace(spaces, expected) if __name__ == "__main__": From ef784d68e04ab4b858ce4c953b2d83b5d5811eda Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Tue, 13 Sep 2022 02:20:30 -0700 Subject: [PATCH 156/704] [MetaSchedule][Test] Migrate `check_trace` to `check_sketch` (#12764) * Migrate AutoBind * Migrate RandomComputeLocation * Migrate CrossThreadReduction * Migrate ParallelVectorizeUnroll --- .../meta_schedule/testing/schedule_rule.py | 48 +- ...t_meta_schedule_schedule_rule_auto_bind.py | 175 +++-- ...le_schedule_rule_cross_thread_reduction.py | 665 +++++++++++++----- ...schedule_rule_parallel_vectorize_unroll.py | 111 +-- ...e_schedule_rule_random_compute_location.py | 72 +- 5 files changed, 718 insertions(+), 353 deletions(-) diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py index b08db0811dd3..12ca4200d77a 100644 --- a/python/tvm/meta_schedule/testing/schedule_rule.py +++ 
b/python/tvm/meta_schedule/testing/schedule_rule.py @@ -18,28 +18,15 @@ from typing import List, Union from tvm.meta_schedule.schedule_rule import ( - AutoBind, AutoInline, - CrossThreadReduction, MultiLevelTiling, - ParallelizeVectorizeUnroll, - RandomComputeLocation, + MultiLevelTilingTensorCore, ReuseType, ScheduleRule, ) -from tvm.meta_schedule.schedule_rule.multi_level_tiling import ( - MultiLevelTilingTensorCore, -) from tvm.target import Target -def auto_bind(target: Target) -> ScheduleRule: - """Default schedule rules for auto bind""" - if target.kind.name == "cuda": - return AutoBind(max_threadblocks=256, thread_extents=[32, 64, 128, 256, 512, 1024]) - raise NotImplementedError(f"{target.kind.name} is not supported") - - def auto_inline(target: Target) -> ScheduleRule: """Default schedule rules for auto inline""" if target.kind.name == "llvm": @@ -65,13 +52,6 @@ def auto_inline(target: Target) -> ScheduleRule: raise NotImplementedError(f"{target.kind.name} is not supported") -def cross_thread_reduction(target: Target) -> ScheduleRule: - """Default schedule rules for with cross-thread reduction""" - if target.kind.name == "cuda": - return CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) - raise NotImplementedError(f"{target.kind.name} is not supported") - - def multi_level_tiling(target: Target) -> ScheduleRule: """Default schedule rules for with multi-level tiling and reuse""" if target.kind.name == "llvm": @@ -154,29 +134,3 @@ def multi_level_tiling_tensor_core( use_software_pipeline=use_software_pipeline, ) raise NotImplementedError(f"{target.kind.name} is not supported") - - -def random_compute_location(target: Target) -> ScheduleRule: - """Default schedule rules for with random-compute-location""" - if target.kind.name == "llvm": - return RandomComputeLocation() - raise NotImplementedError(f"{target.kind.name} is not supported") - - -def parallel_vectorize_unroll(target: Target) -> ScheduleRule: - """Default schedule rules for 
with parallel-vectorize-unroll""" - if target.kind.name == "llvm": - return ParallelizeVectorizeUnroll( - max_jobs_per_core=16, - max_vectorize_extent=32, - unroll_max_steps=[0, 16, 64, 512], - unroll_explicit=True, - ) - if target.kind.name == "cuda": - return ParallelizeVectorizeUnroll( - max_jobs_per_core=-1, - max_vectorize_extent=-1, - unroll_max_steps=[0, 16, 64, 512, 1024], - unroll_explicit=True, - ) - raise NotImplementedError(f"{target.kind.name} is not supported") diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py index a89cca72e1b1..21ad04da473e 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -15,10 +15,8 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply -from tvm.meta_schedule.testing.schedule_rule import auto_bind -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target @@ -60,83 +58,120 @@ def zero_dim_add( C[()] = A[()] + B[()] -def _create_context(mod, target, rule) -> TuneContext: - ctx = TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=[rule], - task_name="test", - ) - return ctx - - def test_cuda_element_wise(): - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "l1, l2 = sch.get_loops(block=b0)", - "l3 = sch.fuse(l1, l2, preserve_unit_iters=True)", - "v4 = sch.sample_categorical(candidates=[32, 64, 128, 256, 512, 1024], 
probs=[0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666])", - "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l5, thread_axis="blockIdx.x")', - 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - ] + @T.prim_func + def elementwise_0( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + ) -> None: + # body + # with T.block("root") + for i_j_fused_0 in T.thread_binding(256, thread="blockIdx.x"): + for i_j_fused_1 in T.thread_binding(1024, thread="threadIdx.x"): + with T.block("C"): + vi = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) // 512) + vj = T.axis.spatial(512, (i_j_fused_0 * 1024 + i_j_fused_1) % 512) + T.reads(A[vi, vj]) + T.writes(B[vi, vj]) + B[vi, vj] = A[vi, vj] + T.float32(1) + + decision_0 = [ + ("SampleCategorical", 5), ] - target = Target("nvidia/geforce-rtx-3080", host="llvm") - ctx = _create_context( - element_wise, - target=target, - rule=auto_bind(target=target), + mod = element_wise + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3080", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.AutoBind( + max_threadblocks=256, + thread_extents=[32, 64, 128, 256, 512, 1024], + ) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[elementwise_0], + expected_decisions=[decision_0], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) def test_cuda_reduction_loop_only(): - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "l1, = sch.get_loops(block=b0)", - "l2 = sch.add_unit_loop(block_or_loop=l1)", - "l3 = sch.fuse(l2, preserve_unit_iters=True)", - "l4, l5 = sch.split(loop=l3, factors=[None, 1], preserve_unit_iters=True)", - 'sch.bind(loop=l4, thread_axis="blockIdx.x")', - 
'sch.bind(loop=l5, thread_axis="threadIdx.x")', - ] - ] - target = Target("nvidia/geforce-rtx-3080", host="llvm") - ctx = _create_context( - reduction_loop_only, - target=target, - rule=auto_bind(target=target), + @T.prim_func + def reduction_loop_only_0( + A: T.Buffer[2, "float32"], + B: T.Buffer[2, "float32"], + C: T.Buffer[(), "float32"], + ) -> None: + for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"): + for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"): + for i0 in T.serial(2): + with T.block("C"): + k0 = T.axis.reduce(2, i0) + T.reads(A[k0], B[k0]) + T.writes(C[()]) + with T.init(): + C[()] = T.float32(1) + C[()] = T.min(C[()], A[k0] / B[k0]) + + mod = reduction_loop_only + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3080", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.AutoBind( + max_threadblocks=256, + thread_extents=[32, 64, 128, 256, 512, 1024], + ) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[reduction_loop_only_0], + expected_decisions=[[]], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) def test_cuda_zero_dim_add(): - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "l1 = sch.add_unit_loop(block_or_loop=b0)", - "l2 = sch.fuse(l1, preserve_unit_iters=True)", - "l3, l4 = sch.split(loop=l2, factors=[None, 1], preserve_unit_iters=True)", - 'sch.bind(loop=l3, thread_axis="blockIdx.x")', - 'sch.bind(loop=l4, thread_axis="threadIdx.x")', - ] - ] - target = Target("nvidia/geforce-rtx-3080", host="llvm") - ctx = _create_context( - zero_dim_add, - target=target, - rule=auto_bind(target=target), + @T.prim_func + def zero_dim_add_0( + A: T.Buffer[(), "float32"], + B: T.Buffer[(), "float32"], + C: T.Buffer[(), "float32"], + ) -> None: + for u_fused_0 in T.thread_binding(1, thread="blockIdx.x"): + 
for u_fused_1 in T.thread_binding(1, thread="threadIdx.x"): + with T.block("C"): + vi = T.axis.spatial(1, 0) + T.reads(A[()], B[()]) + T.writes(C[()]) + C[()] = A[()] + B[()] + + mod = zero_dim_add + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3080", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.AutoBind( + max_threadblocks=256, + thread_extents=[32, 64, 128, 256, 512, 1024], + ) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[zero_dim_add_0], + expected_decisions=[[]], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) if __name__ == "__main__": diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py index 592d32d6245d..a0ca47c09a34 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py @@ -17,14 +17,12 @@ # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply +from tvm import meta_schedule as ms from tvm.meta_schedule.testing import te_workload -from tvm.meta_schedule.testing.schedule_rule import cross_thread_reduction -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext +from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target -from tvm.te.operation import create_prim_func +from tvm.te import create_prim_func @tvm.script.ir_module @@ -59,179 +57,522 @@ def main( ) -def _create_context(mod, target, rule) -> TuneContext: - ctx = 
TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=[rule], - task_name="test", - ) - return ctx +def test_gpu_softmax_mn(): + @T.prim_func + def softmax_mn_0( + A: T.Buffer[(256, 256), "float32"], + T_softmax_norm: T.Buffer[(256, 256), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + T_softmax_maxelem = T.alloc_buffer([256], dtype="float32") + T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32") + T_softmax_expsum = T.alloc_buffer([256], dtype="float32") + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_maxelem"): + i0_1, k = T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem[i0_1]) + with T.init(): + T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k]) + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_exp"): + i0_2, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(A[i0_2, i1_1], T_softmax_maxelem[i0_2]) + T.writes(T_softmax_exp[i0_2, i1_1]) + T_softmax_exp[i0_2, i1_1] = T.exp( + A[i0_2, i1_1] - T_softmax_maxelem[i0_2], dtype="float32" + ) + for i0_3, i1 in T.grid(256, 256): + with T.block("T_softmax_expsum"): + i0_4, k = T.axis.remap("SR", [i0_3, i1]) + T.reads(T_softmax_exp[i0_4, k]) + T.writes(T_softmax_expsum[i0_4]) + with T.init(): + T_softmax_expsum[i0_4] = T.float32(0) + T_softmax_expsum[i0_4] = T_softmax_expsum[i0_4] + T_softmax_exp[i0_4, k] + for i0_5, i1 in T.grid(256, 256): + with T.block("T_softmax_norm"): + i0_6, i1_2 = T.axis.remap("SS", [i0_5, i1]) + T.reads(T_softmax_exp[i0_6, i1_2], T_softmax_expsum[i0_6]) + T.writes(T_softmax_norm[i0_6, i1_2]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_6, i1_2] = T_softmax_exp[i0_6, i1_2] / T_softmax_expsum[i0_6] + @T.prim_func + def softmax_mn_1( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + # 
function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32") + T_softmax_expsum = T.alloc_buffer([256], dtype="float32") + for i0 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 1): + for ax1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_maxelem"): + T.where(ax1_0 * 512 + ax1_1 < 256) + i0_1 = T.axis.spatial(256, ax0 + i0) + k = T.axis.reduce(256, ax1_0 * 512 + ax1_1) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem_shared[i0_1]) + with T.init(): + T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem_shared[i0_1] = T.max( + T_softmax_maxelem_shared[i0_1], A[i0_1, k] + ) + for i1_0 in T.serial(1): + for i1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_exp"): + T.where(i1_0 * 512 + i1_1 < 256) + i0_2 = T.axis.spatial(256, i0) + i1 = T.axis.spatial(256, i1_0 * 512 + i1_1) + T.reads(A[i0_2, i1], T_softmax_maxelem_shared[i0_2]) + T.writes(T_softmax_exp[i0_2, i1]) + T_softmax_exp[i0_2, i1] = T.exp( + A[i0_2, i1] - T_softmax_maxelem_shared[i0_2], dtype="float32" + ) + for i0_3, i1 in T.grid(256, 256): + with T.block("T_softmax_expsum"): + i0_4, k = T.axis.remap("SR", [i0_3, i1]) + T.reads(T_softmax_exp[i0_4, k]) + T.writes(T_softmax_expsum[i0_4]) + with T.init(): + T_softmax_expsum[i0_4] = T.float32(0) + T_softmax_expsum[i0_4] = T_softmax_expsum[i0_4] + T_softmax_exp[i0_4, k] + for i0_5, i1 in T.grid(256, 256): + with T.block("T_softmax_norm"): + i0_6, i1_2 = T.axis.remap("SS", [i0_5, i1]) + T.reads(T_softmax_exp[i0_6, i1_2], T_softmax_expsum[i0_6]) + T.writes(T_softmax_norm[i0_6, i1_2]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_6, i1_2] = T_softmax_exp[i0_6, i1_2] / T_softmax_expsum[i0_6] -def test_gpu_softmax_mn(): - expected = [ - [], - [ - 'b0 = 
sch.get_block(name="T_softmax_maxelem", func_name="main")', - "b1, = sch.get_consumers(block=b0)", - "l2, l3 = sch.get_loops(block=b1)", - "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l7, l8, l9 = sch.get_loops(block=b0)", - "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l11, thread_axis="threadIdx.x")', - ], - [ - 'b0 = sch.get_block(name="T_softmax_expsum", func_name="main")', - "b1, = sch.get_consumers(block=b0)", - "l2, l3 = sch.get_loops(block=b1)", - "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l7, l8, l9 = sch.get_loops(block=b0)", - "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l11, thread_axis="threadIdx.x")', - ], - [ - 'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")', - 'b1 = sch.get_block(name="T_softmax_expsum", func_name="main")', - "b2, = sch.get_consumers(block=b1)", - "l3, l4 = sch.get_loops(block=b2)", - "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)", - 'sch.bind(loop=l7, thread_axis="threadIdx.x")', - "sch.compute_at(block=b1, loop=l3, 
preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")', - "l8, l9, l10 = sch.get_loops(block=b1)", - "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)", - 'sch.bind(loop=l12, thread_axis="threadIdx.x")', - "b13, = sch.get_consumers(block=b0)", - "l14, l15 = sch.get_loops(block=b13)", - "v16 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l17, l18 = sch.split(loop=l15, factors=[None, v16], preserve_unit_iters=True)", - 'sch.bind(loop=l18, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l14, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l19, l20, l21 = sch.get_loops(block=b0)", - "l22, l23 = sch.split(loop=l21, factors=[None, v16], preserve_unit_iters=True)", - 'sch.bind(loop=l23, thread_axis="threadIdx.x")', - ], + @T.prim_func + def softmax_mn_2( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + T_softmax_maxelem = T.alloc_buffer([256], dtype="float32") + T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32") + T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_maxelem"): + i0_1, k = T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem[i0_1]) + with T.init(): + T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k]) + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_exp"): + i0_2, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(A[i0_2, i1_1], T_softmax_maxelem[i0_2]) + T.writes(T_softmax_exp[i0_2, i1_1]) + T_softmax_exp[i0_2, i1_1] = T.exp( + A[i0_2, 
i1_1] - T_softmax_maxelem[i0_2], dtype="float32" + ) + for i0_3 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 32): + for ax1_1 in T.thread_binding(8, thread="threadIdx.x"): + with T.block("T_softmax_expsum"): + i0_4 = T.axis.spatial(256, ax0 + i0_3) + k = T.axis.reduce(256, ax1_0 * 8 + ax1_1) + T.reads(T_softmax_exp[i0_4, k]) + T.writes(T_softmax_expsum_shared[i0_4]) + with T.init(): + T_softmax_expsum_shared[i0_4] = T.float32(0) + T_softmax_expsum_shared[i0_4] = ( + T_softmax_expsum_shared[i0_4] + T_softmax_exp[i0_4, k] + ) + for i1_0 in T.serial(32): + for i1_1_1 in T.thread_binding(8, thread="threadIdx.x"): + with T.block("T_softmax_norm"): + i0_5 = T.axis.spatial(256, i0_3) + i1 = T.axis.spatial(256, i1_0 * 8 + i1_1_1) + T.reads(T_softmax_exp[i0_5, i1], T_softmax_expsum_shared[i0_5]) + T.writes(T_softmax_norm[i0_5, i1]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_5, i1] = ( + T_softmax_exp[i0_5, i1] / T_softmax_expsum_shared[i0_5] + ) + + @T.prim_func + def softmax_mn_3( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + T_softmax_exp = T.alloc_buffer([256, 256], dtype="float32") + T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + for i0 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 1): + for ax1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_maxelem"): + T.where(ax1_0 * 512 + ax1_1 < 256) + i0_1 = T.axis.spatial(256, ax0 + i0) + k = T.axis.reduce(256, ax1_0 * 512 + ax1_1) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem_shared[i0_1]) + with T.init(): + T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem_shared[i0_1] = T.max( + T_softmax_maxelem_shared[i0_1], A[i0_1, k] + ) + for i1_0 in T.serial(1): + for 
i1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_exp"): + T.where(i1_0 * 512 + i1_1 < 256) + i0_2 = T.axis.spatial(256, i0) + i1 = T.axis.spatial(256, i1_0 * 512 + i1_1) + T.reads(A[i0_2, i1], T_softmax_maxelem_shared[i0_2]) + T.writes(T_softmax_exp[i0_2, i1]) + T_softmax_exp[i0_2, i1] = T.exp( + A[i0_2, i1] - T_softmax_maxelem_shared[i0_2], dtype="float32" + ) + for i0_3 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 32): + for ax1_1 in T.thread_binding(8, thread="threadIdx.x"): + with T.block("T_softmax_expsum"): + i0_4 = T.axis.spatial(256, ax0 + i0_3) + k = T.axis.reduce(256, ax1_0 * 8 + ax1_1) + T.reads(T_softmax_exp[i0_4, k]) + T.writes(T_softmax_expsum_shared[i0_4]) + with T.init(): + T_softmax_expsum_shared[i0_4] = T.float32(0) + T_softmax_expsum_shared[i0_4] = ( + T_softmax_expsum_shared[i0_4] + T_softmax_exp[i0_4, k] + ) + for i1_0 in T.serial(32): + for i1_1 in T.thread_binding(8, thread="threadIdx.x"): + with T.block("T_softmax_norm"): + i0_5 = T.axis.spatial(256, i0_3) + i1 = T.axis.spatial(256, i1_0 * 8 + i1_1) + T.reads(T_softmax_exp[i0_5, i1], T_softmax_expsum_shared[i0_5]) + T.writes(T_softmax_norm[i0_5, i1]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_5, i1] = ( + T_softmax_exp[i0_5, i1] / T_softmax_expsum_shared[i0_5] + ) + + decision_0 = [] # type: ignore + decision_1 = [ + ("SampleCategorical", 7), + ] + decision_2 = [ + ("SampleCategorical", 1), + ] + decision_3 = [ + ("SampleCategorical", 1), + ("SampleCategorical", 7), ] - target = Target("nvidia/geforce-rtx-3090", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.softmax_mn( - n=256, - m=256, - ) - ), - target=target, - rule=cross_thread_reduction(target=target), + mod = create_prim_func(te_workload.softmax_mn(n=256, m=256)) + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3090", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + 
ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[softmax_mn_0, softmax_mn_1, softmax_mn_2, softmax_mn_3], + expected_decisions=[decision_0, decision_1, decision_2, decision_3], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 4 - check_trace(spaces, expected) def test_gpu_softmax_mn_after_inline(): - expected = [ - [], - [ - 'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")', - "v1 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l2, l3 = sch.get_loops(block=b0)", - "l4, l5 = sch.split(loop=l3, factors=[None, v1], preserve_unit_iters=True)", - 'sch.bind(loop=l5, thread_axis="threadIdx.x")', - ], - [ - 'b0 = sch.get_block(name="T_softmax_expsum", func_name="main")', - "b1, = sch.get_consumers(block=b0)", - "l2, l3 = sch.get_loops(block=b1)", - "v4 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l5, l6 = sch.split(loop=l3, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l6, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l2, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l7, l8, l9 = sch.get_loops(block=b0)", - "l10, l11 = sch.split(loop=l9, factors=[None, v4], preserve_unit_iters=True)", - 'sch.bind(loop=l11, thread_axis="threadIdx.x")', + @T.prim_func + def softmax_mn_after_inline_0( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + T_softmax_maxelem = T.alloc_buffer([256], dtype="float32") + T_softmax_expsum = T.alloc_buffer([256], dtype="float32") + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_maxelem"): + i0_1, k = 
T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem[i0_1]) + with T.init(): + T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k]) + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_expsum"): + i0_2, k = T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_2, k], T_softmax_maxelem[i0_2]) + T.writes(T_softmax_expsum[i0_2]) + with T.init(): + T_softmax_expsum[i0_2] = T.float32(0) + T_softmax_expsum[i0_2] = T_softmax_expsum[i0_2] + T.exp( + A[i0_2, k] - T_softmax_maxelem[i0_2], dtype="float32" + ) + for i0_3, i1 in T.grid(256, 256): + with T.block("T_softmax_norm"): + i0_4, i1_1 = T.axis.remap("SS", [i0_3, i1]) + T.reads(A[i0_4, i1_1], T_softmax_maxelem[i0_4], T_softmax_expsum[i0_4]) + T.writes(T_softmax_norm[i0_4, i1_1]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_4, i1_1] = ( + T.exp(A[i0_4, i1_1] - T_softmax_maxelem[i0_4], dtype="float32") + / T_softmax_expsum[i0_4] + ) + + @T.prim_func + def softmax_mn_after_inline_1( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + T_softmax_maxelem = T.alloc_buffer([256], dtype="float32") + T_softmax_expsum = T.alloc_buffer([256], dtype="float32") + for i0, i1_0 in T.grid(256, 4): + for i1_1 in T.thread_binding(64, thread="threadIdx.x"): + with T.block("T_softmax_maxelem"): + i0_1 = T.axis.spatial(256, i0) + k = T.axis.reduce(256, i1_0 * 64 + i1_1) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem[i0_1]) + with T.init(): + T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k]) + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_expsum"): + i0_2, k = T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_2, k], T_softmax_maxelem[i0_2]) + T.writes(T_softmax_expsum[i0_2]) + with T.init(): + T_softmax_expsum[i0_2] = T.float32(0) + T_softmax_expsum[i0_2] = T_softmax_expsum[i0_2] + T.exp( + A[i0_2, k] - 
T_softmax_maxelem[i0_2], dtype="float32" + ) + for i0_3, i1 in T.grid(256, 256): + with T.block("T_softmax_norm"): + i0_4, i1_1 = T.axis.remap("SS", [i0_3, i1]) + T.reads(A[i0_4, i1_1], T_softmax_maxelem[i0_4], T_softmax_expsum[i0_4]) + T.writes(T_softmax_norm[i0_4, i1_1]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_4, i1_1] = ( + T.exp(A[i0_4, i1_1] - T_softmax_maxelem[i0_4], dtype="float32") + / T_softmax_expsum[i0_4] + ) + + @T.prim_func + def softmax_mn_after_inline_2( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + T_softmax_maxelem = T.alloc_buffer([256], dtype="float32") + T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + for i0, i1 in T.grid(256, 256): + with T.block("T_softmax_maxelem"): + i0_1, k = T.axis.remap("SR", [i0, i1]) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem[i0_1]) + with T.init(): + T_softmax_maxelem[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem[i0_1] = T.max(T_softmax_maxelem[i0_1], A[i0_1, k]) + for i0_3 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 1): + for ax1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_expsum"): + T.where(ax1_0 * 512 + ax1_1 < 256) + i0_2 = T.axis.spatial(256, ax0 + i0_3) + k = T.axis.reduce(256, ax1_0 * 512 + ax1_1) + T.reads(A[i0_2, k], T_softmax_maxelem[i0_2]) + T.writes(T_softmax_expsum_shared[i0_2]) + with T.init(): + T_softmax_expsum_shared[i0_2] = T.float32(0) + T_softmax_expsum_shared[i0_2] = T_softmax_expsum_shared[i0_2] + T.exp( + A[i0_2, k] - T_softmax_maxelem[i0_2], dtype="float32" + ) + for i1_0 in T.serial(1): + for i1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_norm"): + T.where(i1_0 * 512 + i1_1 < 256) + i0_4 = T.axis.spatial(256, i0_3) + i1_1_1 = T.axis.spatial(256, i1_0 * 512 + i1_1) + T.reads( + A[i0_4, i1_1_1], T_softmax_maxelem[i0_4], T_softmax_expsum_shared[i0_4] + ) + T.writes(T_softmax_norm[i0_4, i1_1_1]) + 
T.block_attr({"axis": 1}) + T_softmax_norm[i0_4, i1_1_1] = ( + T.exp(A[i0_4, i1_1_1] - T_softmax_maxelem[i0_4], dtype="float32") + / T_softmax_expsum_shared[i0_4] + ) + + @T.prim_func + def softmax_mn_after_inline_3( + A: T.Buffer[(256, 256), "float32"], T_softmax_norm: T.Buffer[(256, 256), "float32"] + ) -> None: + T_softmax_maxelem_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + T_softmax_expsum_shared = T.alloc_buffer([256], dtype="float32", scope="shared") + for i0_3 in T.serial(256): + for ax0, ax1_0 in T.grid(1, 1): + for ax1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_maxelem"): + T.where(ax1_0 * 512 + ax1_1 < 256) + i0_1 = T.axis.spatial(256, ax0 + i0_3) + k = T.axis.reduce(256, ax1_0 * 512 + ax1_1) + T.reads(A[i0_1, k]) + T.writes(T_softmax_maxelem_shared[i0_1]) + with T.init(): + T_softmax_maxelem_shared[i0_1] = T.float32(-3.4028234663852886e38) + T_softmax_maxelem_shared[i0_1] = T.max( + T_softmax_maxelem_shared[i0_1], A[i0_1, k] + ) + for ax0, ax1_0 in T.grid(1, 1): + for ax1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_expsum"): + T.where(ax1_0 * 512 + ax1_1 < 256) + i0_2 = T.axis.spatial(256, ax0 + i0_3) + k = T.axis.reduce(256, ax1_0 * 512 + ax1_1) + T.reads(A[i0_2, k], T_softmax_maxelem_shared[i0_2]) + T.writes(T_softmax_expsum_shared[i0_2]) + with T.init(): + T_softmax_expsum_shared[i0_2] = T.float32(0) + T_softmax_expsum_shared[i0_2] = T_softmax_expsum_shared[i0_2] + T.exp( + A[i0_2, k] - T_softmax_maxelem_shared[i0_2], dtype="float32" + ) + for i1_0 in T.serial(1): + for i1_1 in T.thread_binding(512, thread="threadIdx.x"): + with T.block("T_softmax_norm"): + T.where(i1_0 * 512 + i1_1 < 256) + i0_4 = T.axis.spatial(256, i0_3) + i1_1_1 = T.axis.spatial(256, i1_0 * 512 + i1_1) + T.reads( + A[i0_4, i1_1_1], + T_softmax_maxelem_shared[i0_4], + T_softmax_expsum_shared[i0_4], + ) + T.writes(T_softmax_norm[i0_4, i1_1_1]) + T.block_attr({"axis": 1}) + T_softmax_norm[i0_4, 
i1_1_1] = ( + T.exp(A[i0_4, i1_1_1] - T_softmax_maxelem_shared[i0_4], dtype="float32") + / T_softmax_expsum_shared[i0_4] + ) + + decision_0 = [] # type: ignore + decision_1 = [ + ("SampleCategorical", 4), + ] + decision_2 = [ + ("SampleCategorical", 7), + ] + decision_3 = [ + ("SampleCategorical", 7), + ("SampleCategorical", 0), + ] + + mod = Softmax_mn_after_inline + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3090", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) ], - [ - 'b0 = sch.get_block(name="T_softmax_maxelem", func_name="main")', - 'b1 = sch.get_block(name="T_softmax_expsum", func_name="main")', - "b2, = sch.get_consumers(block=b1)", - "l3, l4 = sch.get_loops(block=b2)", - "v5 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l6, l7 = sch.split(loop=l4, factors=[None, v5], preserve_unit_iters=True)", - 'sch.bind(loop=l7, thread_axis="threadIdx.x")', - "sch.compute_at(block=b1, loop=l3, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b1, buffer_index=0, storage_scope="shared")', - "l8, l9, l10 = sch.get_loops(block=b1)", - "l11, l12 = sch.split(loop=l10, factors=[None, v5], preserve_unit_iters=True)", - 'sch.bind(loop=l12, thread_axis="threadIdx.x")', - "b13, b14 = sch.get_consumers(block=b0)", - "l15, l16, l17, l18 = sch.get_loops(block=b13)", - "sch.compute_at(block=b0, loop=l15, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l19, l20, l21 = sch.get_loops(block=b0)", - "l22, l23 = sch.split(loop=l21, factors=[None, v5], preserve_unit_iters=True)", - 'sch.bind(loop=l23, thread_axis="threadIdx.x")', + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[ + softmax_mn_after_inline_0, + 
softmax_mn_after_inline_1, + softmax_mn_after_inline_2, + softmax_mn_after_inline_3, ], - ] - target = Target("nvidia/geforce-rtx-3090", host="llvm") - ctx = _create_context( - mod=Softmax_mn_after_inline, - target=target, - rule=cross_thread_reduction(target=target), + expected_decisions=[decision_0, decision_1, decision_2, decision_3], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 4 - check_trace(spaces, expected) def test_gpu_batch_norm_bmn(): - expected = [ - [], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - "b1, = sch.get_consumers(block=b0)", - "l2, = sch.get_loops(block=b1)", - "v3 = sch.sample_categorical(candidates=[4, 8, 16, 32, 64, 128, 256, 512], probs=[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])", - "l4, l5 = sch.split(loop=l2, factors=[None, v3], preserve_unit_iters=True)", - 'sch.bind(loop=l5, thread_axis="threadIdx.x")', - "sch.compute_at(block=b0, loop=l4, preserve_unit_loops=True, index=-1)", - 'sch.set_scope(block=b0, buffer_index=0, storage_scope="shared")', - "l6, l7, l8, l9 = sch.get_loops(block=b0)", - "l10 = sch.fuse(l8, l9, preserve_unit_iters=True)", - "l11, l12 = sch.split(loop=l10, factors=[None, v3], preserve_unit_iters=True)", - 'sch.bind(loop=l12, thread_axis="threadIdx.x")', - ], + @T.prim_func + def batch_norm_bmn_0(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([1], dtype="float32") + for i0, i1, i2 in T.grid(1, 512, 512): + with T.block("C"): + b, i, j = T.axis.remap("SRR", [i0, i1, i2]) + T.reads(A[b, i, j]) + T.writes(C[b]) + with T.init(): + C[b] = T.float32(0) + C[b] = C[b] + A[b, i, j] * A[b, i, j] + for i0 in T.serial(1): + with T.block("D"): + b = T.axis.spatial(1, i0) + T.reads(C[b]) + T.writes(D[b]) + D[b] = T.sqrt(C[b], dtype="float32") + + @T.prim_func + def batch_norm_bmn_1(A: 
T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_shared = T.alloc_buffer([1], dtype="float32", scope="shared") + for i0_0 in T.serial(1): + for ax0, ax1_ax2_fused_0 in T.grid(1, 1024): + for ax1_ax2_fused_1 in T.thread_binding(256, thread="threadIdx.x"): + with T.block("C"): + b = T.axis.spatial(1, ax0) + i = T.axis.reduce(512, (ax1_ax2_fused_0 * 256 + ax1_ax2_fused_1) // 512) + j = T.axis.reduce(512, (ax1_ax2_fused_0 * 256 + ax1_ax2_fused_1) % 512) + T.reads(A[b, i, j]) + T.writes(C_shared[b]) + with T.init(): + C_shared[b] = T.float32(0) + C_shared[b] = C_shared[b] + A[b, i, j] * A[b, i, j] + for i0_1 in T.thread_binding(256, thread="threadIdx.x"): + with T.block("D"): + T.where(i0_0 * 256 + i0_1 < 1) + b = T.axis.spatial(1, i0_0 * 256 + i0_1) + T.reads(C_shared[b]) + T.writes(D[b]) + D[b] = T.sqrt(C_shared[b], dtype="float32") + + decision_0 = [] # type: ignore + decision_1 = [ + ("SampleCategorical", 6), ] - target = Target("nvidia/geforce-rtx-3090", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.norm_bmn( - B=1, - M=512, - N=512, - ) - ), - target=target, - rule=cross_thread_reduction(target=target), + + mod = create_prim_func(te_workload.norm_bmn(B=1, M=512, N=512)) + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3090", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[batch_norm_bmn_0, batch_norm_bmn_1], + expected_decisions=[decision_0, decision_1], ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 2 - check_trace(spaces, expected) if __name__ == "__main__": - # test_gpu_softmax_mn() - # 
test_gpu_softmax_mn_after_inline() + test_gpu_softmax_mn() + test_gpu_softmax_mn_after_inline() test_gpu_batch_norm_bmn() diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py index 02b55350b7d5..8076fcaa8bd4 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_parallel_vectorize_unroll.py @@ -17,10 +17,7 @@ # pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm from tvm import meta_schedule as ms -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply -from tvm.meta_schedule.testing.schedule_rule import parallel_vectorize_unroll -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext +from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target @@ -68,10 +65,7 @@ def main(a: T.handle, b: T.handle, c: T.handle) -> None: class PureSpatial: @T.prim_func def main(placeholder: T.Buffer[(1, 13, 13, 3, 85), "float32"], placeholder_1: T.Buffer[(1, 26, 26, 3, 85), "float32"], placeholder_2: T.Buffer[(1, 52, 52, 3, 85), "float32"], T_expand_dims: T.Buffer[(1, 80, 10647), "float32"]) -> None: - # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") T_strided_slice_with_axes = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32") T_sigmoid = T.alloc_buffer([1, 52, 52, 3, 1], dtype="float32") T_strided_slice_with_axes_1 = T.alloc_buffer([1, 52, 52, 3, 80], dtype="float32") @@ -224,55 +218,80 @@ def main(placeholder: T.Buffer[(1, 13, 13, 3, 85), "float32"], placeholder_1: T. 
# fmt: on -def _create_context(mod, target, rule): - ctx = TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=[rule], - task_name="test", - ) - return ctx - - def test_parallel_vectorize_unroll(): - expected = [ - [ - 'b0 = sch.get_block(name="root", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.parallel", ann_val=512)', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.vectorize", ann_val=32)', - "v1 = sch.sample_categorical(candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25])", - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.unroll_explicit", ann_val=v1)', - ] + @T.prim_func + def Matmul_0( + A: T.Buffer[(1024, 1024), "float32"], + B: T.Buffer[(1024, 1024), "float32"], + C: T.Buffer[(1024, 1024), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main"}) + # body + with T.block("root"): + T.reads() + T.writes() + T.block_attr( + { + "meta_schedule.parallel": 512, + "meta_schedule.unroll_explicit": 16, + "meta_schedule.vectorize": 32, + } + ) + for i, j, k in T.grid(1024, 1024, 1024): + with T.block("matmul"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + T.reads(A[vi, vk], B[vk, vj]) + T.writes(C[vi, vj]) + with T.init(): + C[vi, vj] = T.float32(0) + C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj] + + decision_0 = [ + ("SampleCategorical", 1), ] + mod = Matmul - target = Target("llvm --num-cores=32") - ctx = _create_context( + actual = ms.TuneContext( mod=mod, - target=target, - rule=parallel_vectorize_unroll(target=target), + target=Target("llvm --num-cores=32"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=16, + max_vectorize_extent=32, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, + ), + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[Matmul_0], + 
expected_decisions=[decision_0], ) - spaces = ctx.space_generator.generate_design_space(mod=mod) - assert len(spaces) == 1 - check_trace(spaces, expected) def test_parallel_vectorize_unroll_spatial(): mod = PureSpatial - target = Target("llvm --num-cores=32") - ctx = _create_context( + actual = ms.TuneContext( mod=mod, - target=target, - rule=ms.schedule_rule.ParallelizeVectorizeUnroll( - max_jobs_per_core=-1, - max_vectorize_extent=-1, - unroll_max_steps=[1, 2, 4, 8, 16, 32, 64], - unroll_explicit=True, - ), - ) - spaces = ctx.space_generator.generate_design_space(mod=mod) - assert len(spaces) == 1 - trace = spaces[0].trace.simplified(remove_postproc=True) + target=Target("llvm --num-cores=32"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=-1, + max_vectorize_extent=-1, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, + ), + ], + task_name="test", + ).generate_design_space() + assert len(actual) == 1 + trace = actual[0].trace.simplified(remove_postproc=True) assert not trace.insts diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py index c951a5adf386..fc52aa199cc1 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_random_compute_location.py @@ -16,10 +16,8 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm -from tvm.meta_schedule.schedule_rule import RandomComputeLocation -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target @@ -55,35 +53,53 @@ def main(a: T.handle, b: T.handle) -> None: # fmt: on -def _create_context(mod, target, rule): - ctx = TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=[rule], - task_name="test", - ) - return ctx - - def test_random_compute_location(): - expected = [ - [ - 'b0 = sch.get_block(name="move", func_name="main")', - "l1 = sch.sample_compute_location(block=b0)", - "sch.compute_at(block=b0, loop=l1, preserve_unit_loops=True, index=-1)", - ] + @T.prim_func + def add_0( + A: T.Buffer[(2048, 2048, 2048), "float32"], + B: T.Buffer[(2048, 2048, 2048), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main"}) + # body + # with T.block("root") + A_cached = T.alloc_buffer([2048, 2048, 2048], dtype="float32") + for i0, j0, i1, j1, k0, i2 in T.grid(128, 64, 4, 4, 64, 4): + for ax0, ax1, ax2 in T.grid(1, 8, 32): + with T.block("move"): + vi = T.axis.spatial(2048, i0 * 16 + i1 * 4 + i2 + ax0) + vj = T.axis.spatial(2048, j0 * 32 + j1 * 8 + ax1) + vk = T.axis.spatial(2048, k0 * 32 + ax2) + T.reads(A[vi, vj, vk]) + T.writes(A_cached[vi, vj, vk]) + A_cached[vi, vj, vk] = A[vi, vj, vk] + for j2, k1 in T.grid(8, 32): + with T.block("add"): + vi = T.axis.spatial(2048, i0 * 16 + i1 * 4 + i2) + vj = T.axis.spatial(2048, j0 * 32 + j1 * 8 + j2) + vk = T.axis.spatial(2048, k0 * 32 + k1) + T.reads(A_cached[vi, vj, vk]) + T.writes(B[vi, vj, vk]) + B[vi, vj, vk] = 
A_cached[vi, vj, vk] + T.float32(1) + + decision_0 = [ + ("SampleComputeLocation", 5), ] + mod = Add - target = Target("llvm") - ctx = _create_context( + actual = ms.TuneContext( mod=mod, - target=target, - rule=RandomComputeLocation(), + target=Target("llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ms.schedule_rule.RandomComputeLocation()], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[add_0], + expected_decisions=[decision_0], ) - spaces = ctx.space_generator.generate_design_space(mod=mod) - assert len(spaces) == 1 - check_trace(spaces, expected) if __name__ == "__main__": From 8058423f096cb71952982188a5c386ad37f6105a Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Tue, 13 Sep 2022 14:23:35 -0700 Subject: [PATCH 157/704] [Hexagon] Create tests to showcase vtcm loading capabilities on Hexagon. (#12667) * [Hexagon] Increase max buffer size for tvm_rpc_android to 1GB. * [Hexagon] Make errors more clear when unable to allocate VTCM buffers and throw an error to fail early. * [Hexagon] Add mem_copy_DLTensor to enable directly calling DMA for mem copies. * [Hexagon] Add new tests as examples of the performance to expect when copying data to VTCM. * [Hexagon] Reduce rpc max size. * [Hexagon] Fix test_parallel_hvx_load_vtcm.py test output to be human readable. 
* Comment out tests that only work on 8Gen1 HDKs to get CI to pass --- python/tvm/contrib/hexagon/session.py | 2 +- src/runtime/hexagon/hexagon_buffer.cc | 9 +- src/runtime/hexagon/hexagon_device_api.cc | 11 + .../test_parallel_hvx_load_vtcm.py | 537 ++++++++++++++++++ .../test_hexagon/test_vtcm_bandwidth.py | 169 ++++++ 5 files changed, 723 insertions(+), 5 deletions(-) create mode 100644 tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py create mode 100644 tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py index 9308e396b2a5..5619d036e283 100644 --- a/python/tvm/contrib/hexagon/session.py +++ b/python/tvm/contrib/hexagon/session.py @@ -58,7 +58,7 @@ def __init__( remote_kw: dict, session_name: str = "hexagon-rpc", remote_stack_size_bytes: int = 256 * 1024, # Min size for main thread in QuRT/sim - rpc_receive_buffer_size_bytes: int = 5 * 1024 * 1024, # Size for passing hexagon tests + rpc_receive_buffer_size_bytes: int = 256 * 1024 * 1024, # Size for passing hexagon tests ): self._launcher = launcher self._session_name: str = session_name diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc index f23317fd01ed..3ba1b5be3d3d 100644 --- a/src/runtime/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon_buffer.cc @@ -62,7 +62,7 @@ struct VTCMAllocation : public Allocation { // allocate nbytes of vtcm on a single page HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info, /*vtcm_size = */ nbytes, - /*b_single_page = */ 1)); + /*b_single_page = */ 0)); // TODO(HWE): Investigate why a non-zero timeout results in // hanging, both in the simulator and on hardware. 
@@ -71,13 +71,14 @@ struct VTCMAllocation : public Allocation { if (context_id_) { data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info); if (!data_) { - LOG(ERROR) << "ERROR: Allocated VTCM ptr is null."; + LOG(ERROR) << "ERROR: HAP_compute_res_acquire returned nullptr when allocating VTCM."; HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_)); return; } } else { - LOG(ERROR) << "ERROR: Unable to acquire requeisted resource."; - return; + LOG(FATAL) << "FATAL: HAP_compute_res_acquire failed to acquire requested VTCM resource."; + throw std::runtime_error( + "HAP_compute_res_acquire failed to acquire requested VTCM resource."); } } ~VTCMAllocation() { diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index cf384ae88db7..fd3a0db2025b 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -184,6 +184,17 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); } +TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor") + .set_body([](TVMArgs args, TVMRetValue* rv) { + DLTensor* dst = args[0]; + DLTensor* src = args[1]; + int size = args[2]; + + hexagon_user_dma_1d_sync(dst->data, src->data, size); + + *rv = static_cast(0); + }); + TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) { void* dst = args[0]; void* src = args[1]; diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py new file mode 100644 index 000000000000..c9ff07c490c8 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -0,0 +1,537 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test different strategies for loading data into vtcm before running HVX workloads. """ + +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n -Vectorized: {} Gops\n -Vectorized and Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n -Preallocated, Vectorized, and Parallelized: {} Gops\n -Single DMA: {} Gops\n -Preloaded: {} Gops\n" + + +def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + sch.parallel(vbo) + return sch + + +def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + return sch + + +def apply_vrmpy_parallelization(sch): + block = sch.get_block("C") + b = sch.get_loops(block) + bo, _ = sch.split(b[0], factors=[4, None]) + sch.parallel(bo) + return sch + 
+ +def apply_vtcm_cache_read_write(sch): + block = sch.get_block("C") + sch.cache_read(block, 0, "global.vtcm") + sch.cache_read(block, 1, "global.vtcm") + sch.cache_write(block, 0, "global.vtcm") + return sch + + +def vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preloaded_vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer( + a, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + B = T.match_buffer( + b, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + C = T.match_buffer( + c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preallocated_vrmpy(operations): + size = 
operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n, i in T.grid(operations, 128): + with T.block("A_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + A_global_vtcm[vn * 128 + vi] = A[vn, vi] + for n, i in T.grid(operations, 128): + with T.block("B_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + B_global_vtcm[vn * 128 + vi] = B[vn, vi] + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + for n, i in T.grid(operations, 32): + with T.block("C_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + C[vn, vi] = C_global_vtcm[vn * 32 + vi] + + return operator + + +def preallocated_single_dma_vrmpy(operations): + size = operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, + b: T.handle, + c: T.handle, + a_v: T.handle, + b_v: T.handle, + c_v: T.handle, + ) -> 
None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + A_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + A.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + B_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + B_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + B.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + B.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + T.evaluate( + T.tvm_call_packed( + 
"device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + C.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + C_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + + return operator + + +def evaluate_result(operations, tag, time, result, expected_output): + transfer_mb = round(3 * operations * 128 / 1e6, 2) + gops = round(operations * 128 * 3 / time.mean / 1e9, 3) + mean_ms = round(time.mean * 1000, 6) + + print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops)) + tvm.testing.assert_allclose(result, expected_output) + + +def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope) + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + +def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_vtcm = np.zeros((a.size), dtype="uint8") + b_vtcm = np.zeros((b.size), dtype="uint8") + c_vtcm = np.zeros((c.size), 
dtype="int32") + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope="global") + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + b_vtcm_hexagon = tvm.runtime.ndarray.array( + b_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + c_vtcm_hexagon = tvm.runtime.ndarray.array( + c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + +@tvm.testing.fixture +def input_a(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_b(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_c(operations): + return np.zeros((operations, 32), dtype="int32") + + +@tvm.testing.fixture +def expected_output(operations, input_a, input_b, input_c): + expected_output = np.zeros(input_c.shape, dtype="int32") + for n in range(operations): + for i in range(32): + for r in range(4): + expected_output[n, i] = expected_output[n, i] + np.uint32( + input_a[n, i * 4 + r] + ) * np.uint32(input_b[n, i * 4 + r]) + return expected_output + + +class TestMatMulVec: + + operations = tvm.testing.parameter( + 1024, + 2048, + 4096, + 5 * 2048, # 3.93MB of total transfer + # 16384, #Only works on 8Gen1 HDK's + # 5 * 4096, # 7.86MB of total transfer. 
Only works on 8Gen1 HDK's + ) + + # Experimentally best configurations for the memcopy + outer_split = tvm.testing.parameter(4) + unroll_split = tvm.testing.parameter(8) + vector_split = tvm.testing.parameter(64) + c_vector_split = tvm.testing.parameter(16) + c_vector_split_unallocated = tvm.testing.parameter(8) + + @tvm.testing.requires_hexagon + def test_loading_vtcm_for_vrmpy( + self, + hexagon_session, + operations, + input_a, + input_b, + input_c, + expected_output, + outer_split, + unroll_split, + vector_split, + c_vector_split, + c_vector_split_unallocated, + ): + + # Run parallel vrmpy without loading to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + base_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with basic memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + basic_load_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated + ) + vectorized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized and parallelized memory loads to VTCM. 
+ sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("C_global.vtcm")], + outer_split, + unroll_split, + c_vector_split_unallocated, + ) + vectorized_parallelized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated and vectorized memory loads to VTCM. + sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split + ) + preallocated_vectorized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated, vectorized, and parallelized memory loads to VTCM. 
+ sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split + ) + preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated single dma memory load to VTCM. + sch = tvm.tir.Schedule(preallocated_single_dma_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + single_dma_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with data preloaded in VTCM. 
+ sch = tvm.tir.Schedule(preloaded_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + input_a = input_a.reshape(operations * 128) + input_b = input_b.reshape(operations * 128) + input_c = input_c.reshape(operations * 32) + preloaded_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations, "global.vtcm" + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + transfer_mb = round(3 * operations * 128 / 1e6, 2) + print( + TEST_OUTPUT_TEMPLATE.format( + transfer_mb, + base_runtime, + basic_load_runtime, + vectorized_runtime, + vectorized_parallelized_runtime, + preallocated_vectorized_runtime, + preallocated_vectorized_parallelized_runtime, + single_dma_runtime, + preloaded_runtime, + ) + ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py new file mode 100644 index 000000000000..6db8b9101997 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Test theoretical bandwidth for data transfers to VTCM for different strategies.""" + +import numpy as np +from tests.python.contrib.test_hexagon.infrastructure import allocate_hexagon_array +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +MB = 1024**2 +KB = 1024 +TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n -Base: {} GBps \n -Vectorized: {} GBps\n -Vectorized and Parallelized: {} GBps\n -Single DMA Copy: {} GBps\n" + + +def memcopy_operator(size): + @T.prim_func + def operator(a: T.handle, a_v: T.handle) -> None: + A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + for ax0 in T.serial(size): + with T.block("A_global.vtcm"): + v0 = T.axis.spatial(size, ax0) + T.reads(A[v0]) + T.writes(A_global_vtcm[v0]) + A_global_vtcm[v0] = A[v0] + + return operator + + +def single_dma_operator(size): + @T.prim_func + def operator(a: T.handle, a_v: T.handle) -> None: + A = T.match_buffer(a, size, dtype="int8", align=128, scope="global") + A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm") + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + A_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + A.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + + return operator + + +def evaluate(hexagon_session, sch, size): + a_shape = size + + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + rng = default_rng() + a = rng.integers(-128, 127, a_shape, dtype="int8") + a_vtcm = 
np.zeros(a_shape, dtype="int8") + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + + # a_hexagon = allocate_hexagon_array(hexagon_session.device, data=a, mem_scope="global") + # a_vtcm_hexagon = allocate_hexagon_array(hexagon_session.device, data=a_vtcm, mem_scope="global.vtcm") + + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + runtime = timer(a_hexagon, a_vtcm_hexagon) + + gbps = round((size / 2**30) / runtime.mean, 4) + tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a) + + return gbps + + +class TestMatMulVec: + + size = tvm.testing.parameter( + 10 * KB, + 20 * KB, + 40 * KB, + 80 * KB, + 160 * KB, + 320 * KB, + 640 * KB, + MB, + 2 * MB, + 3 * MB, + 4 * MB, + # 8 * MB, # Only works on 8gen1 HDKs + ) + + outer_split = tvm.testing.parameter(4) + unroll_split = tvm.testing.parameter(2) + vector_split = tvm.testing.parameter(128) + + @tvm.testing.requires_hexagon + def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split): + + # Run the base memcopy operator. + sch = tvm.tir.Schedule(memcopy_operator(size)) + base_gpbs = evaluate(hexagon_session, sch, size) + + # Run with some basic unroll and vectorize scheduling. + sch = tvm.tir.Schedule(memcopy_operator(size)) + vtcm_block_a = sch.get_block("A_global.vtcm") + vb = sch.get_loops(vtcm_block_a) + vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split]) + sch.unroll(vio_a) + sch.vectorize(vii_a) + vectorize_gbps = evaluate(hexagon_session, sch, size) + + # Run with some basic unroll and vectorize scheduling and parallelization. 
+ sch = tvm.tir.Schedule(memcopy_operator(size)) + vtcm_block_a = sch.get_block("A_global.vtcm") + vb = sch.get_loops(vtcm_block_a) + vbo_a, vbi_a, vio_a, vii_a = sch.split( + vb[0], factors=[outer_split, None, unroll_split, vector_split] + ) + sch.unroll(vio_a) + sch.vectorize(vii_a) + sch.parallel(vbo_a) + parallel_gbps = evaluate(hexagon_session, sch, size) + + # Run using a single dma copy to transfer the data. + sch = tvm.tir.Schedule(single_dma_operator(size)) + single_dma_gbps = evaluate(hexagon_session, sch, size) + + mbs = round(size / MB, 2) + print( + TEST_OUTPUT_TEMPLATE.format( + mbs, base_gpbs, vectorize_gbps, parallel_gbps, single_dma_gbps + ) + ) From 64635b7f372f229f4179806bf65e83f45e9ab856 Mon Sep 17 00:00:00 2001 From: Ziheng Jiang Date: Tue, 13 Sep 2022 14:43:23 -0700 Subject: [PATCH 158/704] [COMMUNITY] Josh Fromm -> PMC (#12768) --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2231fac66596..42f67e87df10 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -36,7 +36,7 @@ We do encourage everyone to work anything they are interested in. 
- [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm - [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager - [Siyuan Feng](https://github.com/Hzfengsy) (PMC): @Hzfengsy - tir -- [Josh Fromm](https://github.com/jwfromm): @jwfromm - frontends, quantization, topi +- [Josh Fromm](https://github.com/jwfromm) (PMC): @jwfromm - frontends, quantization, topi - [Mehrdad Hessar](https://github.com/mehrdadh): @mehrdadh - microTVM, hexagon - [Bohan Hou](https://github.com/spectrometerHBH): @spectrometerHBH - tir, arith, tvm-script - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends From ab8fe34c8e0a73ceb886e95616417281019c4d1d Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Tue, 13 Sep 2022 18:24:33 -0600 Subject: [PATCH 159/704] [FQ2I] Quantized constant bias (#12666) * support fp32 constants in quantized bias add * add a test * clean up comment * assert the bias is floating point as well as constant before requantizing --- .../transform/fake_quantization_to_integer.py | 43 +++++++++++-------- .../test_pass_fake_quantization_to_integer.py | 15 +++++-- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index bb874c131cd8..242740399f96 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -79,7 +79,6 @@ def quantize(expr, type_map): out_dtype=expr.attrs.out_dtype, axis=t.axis, ) - return [ out, TensorAffineType(expr.args[1], expr.args[2], expr.attrs.out_dtype, expr.attrs.axis), @@ -204,23 +203,30 @@ def bias_add(expr, type_map): """Rewrite a bias_add op""" x, b = expr.args x_t = type_map[x] - b_t = type_map[b] - in_scale = fold_constant(x_t.scale) - in_zero_point = fold_constant(x_t.zero_point) - if not ( - approx_equal(x_t.scale, b_t.scale) - and approx_equal(x_t.zero_point, 
b_t.zero_point) - and tvm.ir.structural_equal(x_t.dtype, b_t.dtype) - ): - b = relay.qnn.op.requantize( - b, - b_t.scale, - b_t.zero_point, - in_scale, - in_zero_point, - out_dtype=x_t.dtype, - axis=0, - ) + if b in type_map: + # Ensure bias matches the previous op + b_t = type_map[b] + in_scale = fold_constant(x_t.scale) + in_zero_point = fold_constant(x_t.zero_point) + if not ( + approx_equal(x_t.scale, b_t.scale) + and approx_equal(x_t.zero_point, b_t.zero_point) + and tvm.ir.structural_equal(x_t.dtype, b_t.dtype) + ): + b = relay.qnn.op.requantize( + b, + b_t.scale, + b_t.zero_point, + in_scale, + in_zero_point, + out_dtype=x_t.dtype, + axis=0, + ) + else: + # If the bias is a constant, we need to quantize it + assert isinstance(b, relay.expr.Constant) + assert b.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"] + b = relay.qnn.op.quantize(b, x_t.scale, x_t.zero_point, axis=0, out_dtype=x_t.dtype) out = relay.op.nn.bias_add(x, b, **expr.attrs) return [out, x_t] @@ -431,6 +437,7 @@ def pad(expr, type_map): else: # If the pad-value is a constant, we need to quantize it assert isinstance(pad_value, relay.expr.Constant) + assert pad_value.checked_type.dtype in ["float32", "float64", "float16", "bfloat16"] pad_value = relay.qnn.op.quantize(pad_value, t.scale, t.zero_point) out = relay.op.nn.pad(arg, pad_value=pad_value, **expr.attrs) diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index a63d82e68750..46979dfc3cba 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -192,26 +192,33 @@ def test_fake_transpose_quantize_conv(): compare_fq_to_int(op, [x_np, w_np]) -def test_fake_transpose_quantize_conv_bias_add(): +@pytest.mark.parametrize("const_bias", [False, True]) +def test_fake_transpose_quantize_conv_bias_add(const_bias): x = relay.var("x", shape=[1, 224, 224, 3], 
dtype="int8") w = relay.var("w", shape=[16, 3, 5, 5], dtype="int8") - bias = relay.var("bias", shape=[16], dtype="int32") one = relay.const(1.0) zero = relay.const(0) + if const_bias: + bias = relay.const(np.random.random(16).astype("float32")) + else: + bias = relay.qnn.op.dequantize(relay.var("bias", shape=[16], dtype="int32"), one, zero) x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) x = relay.transpose(x, [0, 3, 1, 2]) op = relay.op.nn.conv2d( x, relay.qnn.op.dequantize(w, relay.const(0.5), zero), kernel_size=[5, 5] ) - op = relay.op.nn.bias_add(op, relay.qnn.op.dequantize(bias, one, zero)) + op = relay.op.nn.bias_add(op, bias) op = relay.qnn.op.quantize(op, one, zero) x_np = np.random.randint(-128, 127, size=[1, 224, 224, 3], dtype="int8") w_np = np.random.randint(-128, 127, size=[16, 3, 5, 5], dtype="int8") bias_np = np.random.randint(-32768, 32767, size=[16], dtype="int32") + args = [x_np, w_np] - compare_fq_to_int(op, [x_np, w_np, bias_np]) + if not const_bias: + args.append(bias_np) + compare_fq_to_int(op, args) def test_fake_transpose_quantize_conv_bias_add_per_channel(): From 91bd9a3fec0dfc419e739d12ee098d0bc39f763d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 13 Sep 2022 21:43:54 -0700 Subject: [PATCH 160/704] [Hybrid] Fix handling AST subcription for Python3.9 (#12769) fixed https://github.com/apache/tvm/issues/9955, this is covered by the existing test case `tests/python/relay/test_op_level3.py::test_unique` --- python/tvm/te/hybrid/parser.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py index 1e1e4c50f7b9..4956aaf0be32 100644 --- a/python/tvm/te/hybrid/parser.py +++ b/python/tvm/te/hybrid/parser.py @@ -374,6 +374,10 @@ def visit_Attribute(self, node): def visit_Subscript(self, node): args = self.visit(node.slice) + if sys.version_info > (3, 8): + if not isinstance(node.slice, ast.Tuple): + args = [args] + arr = self.visit(node.value) if isinstance(arr, Array): for 
i in args: From f7f2cda6756c170755fc18cbe23f6bf4a4b0d584 Mon Sep 17 00:00:00 2001 From: Matthew Barrett <55580676+mbaret@users.noreply.github.com> Date: Wed, 14 Sep 2022 10:25:45 +0100 Subject: [PATCH 161/704] [AOT] Add AOTLowerMain pass to lower a Relay main into TIR (#12550) This is a pass refactored out of the AOTExecutorCodegen. Instead of combining all of the functionality of the AOTExecutorCodegen into a single monolithic pass, this pass only handles the lowering of the Relay main function into TIR. Tests for the pass are included. --- CMakeLists.txt | 1 + python/tvm/relay/backend/_aot.py | 21 + python/tvm/relay/backend/aot.py | 43 + python/tvm/relay/backend/utils.py | 7 + src/relay/backend/aot/aot_lower_main.cc | 861 ++++++++++++++++++ src/relay/backend/aot/aot_lower_main.h | 58 ++ src/relay/backend/utils.cc | 28 +- src/relay/backend/utils.h | 74 ++ .../relay/backend/aot/aot_lower_main_test.cc | 63 ++ .../relay/aot/test_pass_aot_lower_main.py | 429 +++++++++ 10 files changed, 1572 insertions(+), 13 deletions(-) create mode 100644 python/tvm/relay/backend/_aot.py create mode 100644 python/tvm/relay/backend/aot.py create mode 100644 src/relay/backend/aot/aot_lower_main.cc create mode 100644 src/relay/backend/aot/aot_lower_main.h create mode 100644 tests/cpp/relay/backend/aot/aot_lower_main_test.cc create mode 100644 tests/python/relay/aot/test_pass_aot_lower_main.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 8995f9a87fb7..7c355238b8c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -305,6 +305,7 @@ tvm_file_glob(GLOB_RECURSE RELAY_PASS_SRCS tvm_file_glob(GLOB RELAY_BACKEND_SRCS src/relay/backend/*.cc src/relay/backend/vm/*.cc + src/relay/backend/aot/*.cc ) tvm_file_glob(GLOB_RECURSE RELAY_IR_SRCS src/relay/ir/*.cc diff --git a/python/tvm/relay/backend/_aot.py b/python/tvm/relay/backend/_aot.py new file mode 100644 index 000000000000..437cd71c4c35 --- /dev/null +++ b/python/tvm/relay/backend/_aot.py @@ -0,0 +1,21 @@ +# Licensed to the Apache 
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The AOT FFI namespace. +""" +import tvm._ffi + +tvm._ffi._init_api("relay.backend.aot", __name__) diff --git a/python/tvm/relay/backend/aot.py b/python/tvm/relay/backend/aot.py new file mode 100644 index 000000000000..8e7406c72f32 --- /dev/null +++ b/python/tvm/relay/backend/aot.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""AOT passes""" +from tvm.ir.transform import Pass +from .utils import CallType + +from . 
import _aot + + +def AOTLowerMain(mod_name: str, config: object, call_type: CallType) -> Pass: + """Lower a Relay main function into an AOT TIR main function. + + Parameters + ---------- + mod_name: str + The name of the module. + config : CompilationConfig + The compilation configuration. + call_type : CallType + The calling convention to use. + + Returns + ------- + Pass + The AOTLowerMain pass. + + """ + return _aot.AOTLowerMain(mod_name, config, call_type.value) diff --git a/python/tvm/relay/backend/utils.py b/python/tvm/relay/backend/utils.py index b8430a9e6b6e..7289dbbc4af4 100644 --- a/python/tvm/relay/backend/utils.py +++ b/python/tvm/relay/backend/utils.py @@ -15,6 +15,13 @@ # specific language governing permissions and limitations # under the License. """Utility backend functions.""" +from enum import Enum + + +class CallType(Enum): + Packed = 0 + CPacked = 1 + Unpacked = 2 def _is_valid_modname(mod_name): diff --git a/src/relay/backend/aot/aot_lower_main.cc b/src/relay/backend/aot/aot_lower_main.cc new file mode 100644 index 000000000000..ce72595dc10b --- /dev/null +++ b/src/relay/backend/aot/aot_lower_main.cc @@ -0,0 +1,861 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/relay/backend/aot/aot_lower_main.cc + * \brief Lower the Relay main func into an AOT TIR main func. + */ +#include "./aot_lower_main.h" + +#include +#include + +#include "../../op/call/call.h" +#include "../../op/memory/device_copy.h" +#include "../../op/memory/memory.h" +#include "../../transforms/device_aware_visitors.h" +#include "../name_transforms.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace backend { +namespace aot { + +/*! + * \brief Looks at the expressions in a given function and produces an Expr to + * StorageInfo map by assigning one or more StorageInfos to the expressions that + * require storage. + * + * This pass is leveraged by AOTMainLowerer to perform an initial naive allocation + * for tensors in the Relay main function. The resulting storage map is then lowered + * into TIR allocations by AOTMainLowerer where the allocation can be subsequently + * optimized by later passes (e.g. USMP). + */ +class ExprAllocator : public transform::DeviceAwareExprVisitor { + public: + ExprAllocator() : transform::DeviceAwareExprVisitor(Optional()) {} + + // run the visitor on a global function. 
+ void Run(const Function& func) { VisitExpr(func); } + + std::vector GetReturnSIDs() const { return return_sids_; } + + StorageMap GetStorageMap() const { return expr_storage_map_; } + + using ExprVisitor::VisitExpr_; + + void DeviceAwareVisitExpr_(const CallNode* call_node) final { + Array args; + + CallLoweredProps call_lowered_props = GetCallLoweredProps(call_node); + if (call_lowered_props.lowered_func.defined()) { + args = call_lowered_props.arguments; + } else { // Relay functions that have not been lowered and lowered extern functions + args = call_node->args; + if (call_node->op.as()) { // Lowered extern function + ICHECK(!(call_node->attrs.defined())) << "Extern functions should have null attributes."; + } else { // Relay function which has not been lowered yet + ICHECK(call_node->op.as()) + << "Expected the call to be to a lowered primfunc, a lowered extern function or a " + "unlowered Relay function."; + } + } + CreateStorage(call_node); + for (const Expr& arg : args) { + VisitExpr(arg); + } + AssignReturnSID(GetRef(call_node)); + } + + void DeviceAwareVisitExpr_(const FunctionNode* func_node) final { + if (function_nesting() > 1) { + // Do not recurse into sub functions. 
+ return; + } + for (const auto& param : func_node->params) { + CreateStorage(param.get()); + } + VisitExpr(func_node->body); + } + + void PreVisitLetBinding_(const Var& var, const Expr& value) final { + VisitExpr(value); + StorageInfo si = GetStorage(value); + expr_storage_map_[var] = si; + } + + void VisitExpr_(const ConstantNode* op) final { + CreateStorage(op); + AssignReturnSID(GetRef(op)); + } + + void VisitExpr_(const VarNode* op) final { AssignReturnSID(GetRef(op)); } + + void VisitExpr_(const TupleNode* op) final { + std::vector storage_ids; + std::vector virtual_devices; + std::vector storage_sizes_in_bytes; + Expr expr = GetRef(op); + for (Expr field : op->fields) { + auto sid = GetStorage(field); + storage_ids.insert(storage_ids.end(), sid->storage_ids.begin(), sid->storage_ids.end()); + virtual_devices.insert(virtual_devices.end(), sid->virtual_devices.begin(), + sid->virtual_devices.end()); + storage_sizes_in_bytes.insert(storage_sizes_in_bytes.end(), + sid->storage_sizes_in_bytes.begin(), + sid->storage_sizes_in_bytes.end()); + } + expr_storage_map_[expr] = StorageInfo(storage_ids, virtual_devices, storage_sizes_in_bytes); + AssignReturnSID(expr); + } + + void VisitExpr_(const TupleGetItemNode* op) final { + Expr expr = GetRef(op); + auto sids = GetStorage(op->tuple); + ICHECK_LT(static_cast(op->index), sids->storage_ids.size()); + expr_storage_map_[expr] = + StorageInfo({sids->storage_ids[op->index]}, {sids->virtual_devices[op->index]}, + {sids->storage_sizes_in_bytes[op->index]}); + AssignReturnSID(expr); + } + + void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "'If' is not supported."; } + + private: + /*! + * \brief Assign the expression's storage IDs as the return storage IDs. + * \note This is called when visiting every expression on the understanding + * that the returned expression will be visited last. 
+ */ + void AssignReturnSID(const Expr& e) { + if (expr_storage_map_.find(e) != expr_storage_map_.end()) { + StorageInfo& sinfo = expr_storage_map_[e]; + return_sids_.clear(); + for (auto sid : sinfo->storage_ids) { + return_sids_.push_back(sid); + } + } + } + + /*! + * \brief Get the necessary storage for the expression. + * \param expr The expression. + * \return The corresponding token. + */ + StorageInfo GetStorage(const Expr& expr) { + // See through "on_device" calls. + Expr true_expr = IgnoreOnDevice(expr); + VisitExpr(true_expr); + auto it = expr_storage_map_.find(true_expr); + ICHECK(it != expr_storage_map_.end()) << "Could not find " << true_expr->GetTypeKey() << " " + << PrettyPrint(true_expr) << " in storage device map"; + return it->second; + } + + /*! + * \brief Create storage for the expression. + */ + void CreateStorage(const ExprNode* op) { + Expr expr = GetRef(op); + return CreateStorage(expr, GetVirtualDevice(expr)); + } + + /*! + * \brief Create storage to hold the result of evaluating \p expr in \p virtual_device. + */ + void CreateStorage(const Expr& expr, const VirtualDevice& virtual_device) { + ICHECK(!virtual_device->IsFullyUnconstrained()) + << "invalid virtual device for expr:" << std::endl + << PrettyPrint(expr); + std::vector storage_ids; + std::vector virtual_devices; + std::vector storage_sizes_in_bytes; + for (const auto& ttype : FlattenTupleType(expr->checked_type())) { + storage_ids.push_back(next_available_sid_++); + virtual_devices.push_back(virtual_device); + storage_sizes_in_bytes.push_back(GetMemorySizeBytes(ttype->shape, ttype->dtype)); + } + expr_storage_map_[expr] = StorageInfo(std::move(storage_ids), std::move(virtual_devices), + std::move(storage_sizes_in_bytes)); + } + + /*! \brief Map between Exprs and StorageInfos */ + StorageMap expr_storage_map_; + /*! \brief The next available storage ID to be used */ + int next_available_sid_{0}; + /*! 
\brief The storage IDs that correspond to return values */ + std::vector return_sids_; +}; + +std::tuple> CreateStorage(const Function& func) { + ExprAllocator expr_allocator; + expr_allocator.Run(func); + return std::make_tuple(expr_allocator.GetStorageMap(), expr_allocator.GetReturnSIDs()); +} + +class AOTMainLowerer : public MixedModeVisitor { + public: + AOTMainLowerer(tvm::CompilationConfig config, CallType call_type) + : config_(config), call_type_(call_type) {} + + IRModule Lower(IRModule mod, String mod_name) { + VLOG_CONTEXT << "AOT"; + IRModule lowered_mod = GetRef(mod.CopyOnWrite()); + + auto lowered_main = lowered_mod->Lookup("main"); + auto lowered_main_func = GetRef(lowered_main.as()); + + // Assign StorageInfo to all the Relay exprs and get the return SIDs + std::tie(expr_storage_map_, return_sid_) = CreateStorage(lowered_main_func); + + for (auto input : lowered_main_func->params) { + input_vars_.push_back(input); + std::string input_name = SanitizeName(input->name_hint()); + // We don't want the compiler changing input names in the + // event of a sanitization collision. Therefore, enforcing + // the var created to use the input_name strictly. + CreateIOVar(input, input_name, /*use_unique_name = */ false); + } + + // Define the storage allocator ids + for (auto kv : expr_storage_map_) { + for (auto sid : kv.second->storage_ids) { + // The buffer_var is created with storage_scope to be global.workspace to be serviced by + // TVMBackendAllocWorkspace(TVMBAW) calls, explicitly. The reasoning being the executor + // allocates should be serviced by TVMBAWs as the data could be accessed by many devices and + // should not be lowered to the stack. 
For more details please refer to the discussion here: + // https://github.com/apache/tvm/issues/9022 + tir::Var buffer_var(MakeString("sid_", sid), + PointerType(PrimType(DataType::Int(8)), "global.workspace")); + sids_table_[sid] = buffer_var; + } + } + + // Create output vars for the TIR main func + // If output tensor names were provided use them + if (auto opt = lowered_main->GetAttr>("output_tensor_names")) { + Array output_tensor_names = opt.value(); + Expr output_expr = lowered_main_func->body; + if (output_expr->checked_type()->IsInstance()) { + TupleType output_tuple_type = Downcast(output_expr->checked_type()); + for (unsigned i = 0; i < output_tuple_type->fields.size(); i++) { + // AoT Executor Codegen does not create these names, + // thus should be used as they are provided. + CreateIOVar(output_tuple_type->fields[i], output_tensor_names[i], + /*use_unique_name = */ false); + } + } else { + // AoT Executor Codegen does not create these names, + // thus should be used as they are provided. + CreateIOVar(lowered_main_func->body, output_tensor_names[0], /*use_unique_name = */ false); + } + } else { + // If output tensor names are not provided we will generate output(x) + // where x is a counter to create unique names. 
+ if (lowered_main_func->body->checked_type()->IsInstance()) { + CreateIOVar(lowered_main_func->body, "output"); + } else { + CreateIOVar(lowered_main_func->body, "output", /*use_unique_name = */ false); + } + } + + CollectDeviceVariables(lowered_mod->GetAttr>("device_contexts") + .value_or(Map())); + VisitExpr(lowered_main_func->body); + + // Remove the Relay main and replace it with the lowered TIR version + lowered_mod->Remove(lowered_mod->GetGlobalVar("main")); + auto tir_main_func = CreateMainFunc(mod_name); + lowered_mod->Update(GlobalVar(runtime::symbol::tvm_module_main), tir_main_func); + lowered_mod = tir::transform::RemoveNoOp()(lowered_mod); + return lowered_mod; + } + + void VisitExpr_(const CallNode* call_node) override { + OnDeviceProps on_device_props = GetOnDeviceProps(call_node); + if (on_device_props.body.defined()) { + VisitExpr(on_device_props.body); + return; + } + + DeviceCopyProps device_copy_props = GetDeviceCopyProps(call_node); + CallLoweredProps call_lowered_props = GetCallLoweredProps(call_node); + + if (device_copy_props.body.defined()) { + // TODO(mbs): device_copy cleanup + // Suspect treating as no-op is better since already built into the StorageInfo? + LOG(FATAL) << "The AOT executor does not currently support device_copy"; + return; + } + + // At this point we should only see calls of the form call_lowered(@callee, (args...)), + // where @callee can be a PrimFunc we've compiled or an external function supplied via + // some other mechanism. + ICHECK(call_lowered_props.lowered_func.defined()) + << "AOT does not support calling Relay functions. 
Attempting to call:" << std::endl + << PrettyPrint(GetRef(call_node)); + for (const auto& arg : call_lowered_props.arguments) { + // Evaluate the args + VisitExpr(arg); + } + CreateFuncCall(call_lowered_props, GetRef(call_node)); + } + + void VisitExpr_(const VarNode* op) override { + Expr expr = GetRef(op); + StorageInfo& sinfo = expr_storage_map_[expr]; + + // Let bound vars refer to a value, so these should not be considered "output" vars. + if (let_bound_vars_.find(GetRef(op)) != let_bound_vars_.end()) { + return; + } + + // If the Var node is an output node we need to copy the content of the variable to the output + // It's safe to check the SID here because Var StorageToken are never reallocated + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sinfo->storage_ids[0]); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + auto var_expr = FindExpr(expr); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), var_expr[0], + /*pack_input*/ false, sinfo->storage_sizes_in_bytes[0]); + } + } + + void VisitExpr_(const ConstantNode* op) override { + Expr expr = GetRef(op); + ICHECK(expr_storage_map_.find(expr) != expr_storage_map_.end()) + << "Storage map did not contain constant expr " << PrettyPrint(expr); + StorageInfo& sinfo = expr_storage_map_[expr]; + std::stringstream ss; + ss << "constant_" << constant_map_.size(); + + tir::Var constant(ss.str(), PointerType(PrimType(DataType(op->data->dtype)))); + constant_map_[constant] = op; + auto sid = sinfo->storage_ids[0]; + sids_table_[sid] = constant; + + // If the Constant node is an output node we need to copy the content of the parameter to the + // output. 
A node can only produce a single output + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + auto param_handle = tvm::tir::Call(DataType::Handle(), tvm::tir::builtin::lookup_param(), + {tir::StringImm(ss.str())}); + CopyToOutput(GetBufferVarForIO(input_vars_.size() + output_index), constant, + /* pack_input */ false, sinfo->storage_sizes_in_bytes[0]); + } + } + + void VisitExpr_(const TupleNode* op) override { + for (auto field : op->fields) { + VisitExpr(field); + } + } + + void VisitExpr_(const LetNode* op) override { + auto pre_visit = [this](const LetNode* op) { + let_bound_vars_.insert(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + this->VisitExpr(op->body); + this->visit_counter_[op] += 1; + }; + ExpandANormalForm(op, pre_visit, post_visit); + } + + void VisitExpr_(const TupleGetItemNode* op) override { VisitExpr(op->tuple); } + void VisitExpr_(const OpNode* op) override { + if (GetRef(op) != CallLoweredOp() && GetRef(op) != OnDeviceOp()) { + LOG(FATAL) << "All OpNodes except for call_lowered should have been expanded"; + } + } + void VisitExpr_(const IfNode* op) override { + LOG(FATAL) << "All GlobalVarNodes should be removed before AOT executor's Codegen is called"; + } + void VisitExpr_(const FunctionNode* op) override { + ICHECK(op->GetAttr(attr::kCompiler).defined()) + << "FunctionNode only supported by custom codegen"; + } + void VisitExpr_(const RefCreateNode* op) override { + LOG(FATAL) << "AOT executor does not support references (found RefCreateNode)"; + } + void VisitExpr_(const RefReadNode* op) override { + LOG(FATAL) << "AOT executor does not support references (found RefReadNode)"; + } + void VisitExpr_(const RefWriteNode* op) override { + LOG(FATAL) << "AOT executor does not support references (found RefWriteNode)"; + } + void VisitExpr_(const ConstructorNode* 
op) override { + LOG(FATAL) << "AOT executor does not support ADTs (found ConstructorNode)"; + } + void VisitExpr_(const MatchNode* op) override { + LOG(FATAL) << "AOT executor does not support matching (found MatchNode)"; + } + + private: + /*! + * \brief Create the main PrimFunc to execute the graph. + * \note The packed function calls don't pack their arguments. The AOT + * runner function needs to be legalized by the LegalizePackedCalls pass. + */ + tir::PrimFunc CreateMainFunc(String mod_name) { + tir::Stmt body = tir::SeqStmt(stmts_); + // Allocate the sids + std::unordered_map allocated; + std::vector> sids_to_allocate; + + for (auto kv : expr_storage_map_) { + // Only allocate sids that are needed + const bool is_input = + (std::find(input_vars_.begin(), input_vars_.end(), kv.first) != input_vars_.end()); + if (is_input) { + continue; + } + + for (unsigned int i = 0; i < kv.second->storage_ids.size(); i++) { + sids_to_allocate.push_back( + std::make_pair(kv.second->storage_ids[i], kv.second->storage_sizes_in_bytes[i])); + } + } + + // Sort the SID allocation to make output deterministic + std::sort(sids_to_allocate.begin(), sids_to_allocate.end()); + + for (auto p : sids_to_allocate) { + int sid = p.first; + int size = p.second; + + if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) { + continue; + } + + // Make sure it hasn't already been allocated, this can happen + // with let-bound var/value pairs. 
+ if (allocated.find(sid) != allocated.end()) { + continue; + } + + allocated[sid] = constant_map_.count(sids_table_[sid]); + + // TODO(giuseros): we should allocate this once outside the PrimFunc + // so we don't pay the price of allocation for every inference + if (!allocated[sid]) { + PointerType ptype = Downcast(sids_table_[sid]->type_annotation); + DataType element_type = Downcast(ptype->element_type)->dtype; + body = tir::Allocate(sids_table_[sid], element_type, {size}, tir::const_true(), body); + } + allocated[sid] = true; + } + + for (auto kv : constant_map_) { + auto buffer_var = kv.first; + auto dtype = DataType(kv.second->data->dtype); + + int ndim = kv.second->data->ndim; + Array extents; + + for (int i = 0; i < ndim; i++) { + int shape = kv.second->data->shape[i]; + extents.push_back(tir::make_const(DataType::Int(32), shape, Span())); + } + body = tir::AllocateConst(buffer_var, dtype, extents, kv.second->data, body); + } + + // Define the PrimFunc attributes + Map dict_attrs; + String run_func_name = runtime::get_name_mangled(mod_name, runtime::symbol::tvm_module_main); + dict_attrs.Set("global_symbol", run_func_name); + dict_attrs.Set("runner_function", Bool(true)); + dict_attrs.Set(tvm::attr::kTarget, config_->host_target); + Array input_vars = + Array(main_signature_.begin(), main_signature_.begin() + input_vars_.size()); + dict_attrs.Set("input_vars", input_vars); + Array output_vars = + Array(main_signature_.begin() + input_vars_.size(), + main_signature_.begin() + input_vars_.size() + return_sid_.size()); + dict_attrs.Set("output_vars", output_vars); + + tir::Stmt device_activations = GenerateAllDeviceHook("Activate"); + tir::Stmt device_deactivations = GenerateAllDeviceHook("Deactivate"); + tir::Stmt final_body = tir::SeqStmt({device_activations, body, device_deactivations}); + + // Make the PrimFunc + return tir::PrimFunc(main_signature_, final_body, VoidType(), main_buffer_map_, {}, + DictAttrs(dict_attrs)); + } + + /*! 
+ * \brief Collects device context variables for passing to operators + */ + void CollectDeviceVariables(const Map& device_contexts) { + Map target_contexts; + TargetKindAttrMap target_attr_map = tvm::TargetKind::GetAttrMap("use_device_api"); + + for (const auto& it : device_contexts) { + const GlobalVar& global_var = it.first; + const std::string device_context_name = it.second; + + Optional target_kind = tvm::TargetKind::Get(device_context_name); + if (!target_kind || !target_attr_map.count(target_kind.value())) { + return; + } + if (target_attr_map[target_kind.value()]) { + std::string context_name = SanitizeName(device_context_name); + tir::Var device_context_var("device_context_" + context_name, DataType::Handle()); + + auto pair = target_contexts.find(target_kind.value()); + if (pair != target_contexts.end()) { + device_context_var = (*pair).second; + } else { + main_signature_.push_back(device_context_var); + devices_.Set(context_name, device_context_var); + target_contexts.Set(target_kind.value(), device_context_var); + } + + device_contexts_.Set(global_var, device_context_var); + } + } + } + + /*! 
+ * \brief Return a vector of variables that represents the sids for the given Relay Expr + */ + std::vector PackSid(Expr expr) { + std::vector buffer_vars; + + ICHECK(expr_storage_map_.find(expr) != expr_storage_map_.end()) + << "Storage map did not contain constant expr " << PrettyPrint(expr); + StorageInfo& sinfo = expr_storage_map_[expr]; + + // Note that an expression can have multiple sids associated with it + // e.g., returning multiple values from a function + for (auto sid : sinfo->storage_ids) { + // Determine if an sid is an output buffer + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid); + if (output_iter != return_sid_.end()) { + int output_index = std::distance(return_sid_.begin(), output_iter); + buffer_vars.push_back(GetBufferVarForIO(input_vars_.size() + output_index)); + continue; + } + + auto sid_value = sids_table_[sid]; + buffer_vars.push_back(sid_value); + } + return buffer_vars; + } + + /*! + * \brief Given an expression return the variable(s) associated with that expression + */ + std::vector FindExpr(Expr arg) { + auto input_iter = std::find(input_vars_.begin(), input_vars_.end(), arg); + if (input_iter != input_vars_.end()) { + // Input variable + int main_index = std::distance(input_vars_.begin(), input_iter); + return {GetBufferVarForIO(main_index)}; + } else { + // Storage identifier (i.e., intermediate memory) + return PackSid(arg); + } + } + + void PushArgs(const Expr& expr, const std::vector& sids, Array* args) { + const TupleNode* t = expr.as(); + if (t != nullptr) { + CHECK_EQ(sids.size(), t->fields.size()) << "Relay tuple does not map 1:1 into TIR; AOT can't " + "handle this type of Relay Expr in a CallNode."; + } + + args->insert(args->end(), sids.begin(), sids.end()); + } + + /*! 
+ * \brief Wraps a call_extern with a tvm_check_return annotation if required otherwise + * returns the passed Call + */ + tir::Call AddCheckReturn(tir::Call existing_call) { + Array args = {tir::make_const(DataType::Int(32, 1), 0, Span()), + tir::make_const(DataType::Int(32, 1), -1, Span()), existing_call}; + return tir::Call(DataType::Int(32), tir::builtin::tvm_check_return(), args); + } + + /*! + * \brief Create a function call + * \param call_lowered_props The lowered function and the arguments to call it with + * \param result_expr The call we got func and args from (so as to recover the storage + * ids to hold the result). + */ + void CreateFuncCall(CallLoweredProps call_lowered_props, const Expr& result_expr) { + std::string func_name = call_lowered_props.lowered_func->name_hint; + tvm::Array args{tvm::tir::StringImm(func_name)}; + std::vector create_func_call_stmts; + + // Pack the inputs + for (const Expr& arg : call_lowered_props.arguments) { + auto sids = FindExpr(arg); + PushArgs(arg, sids, &args); + } + + // Pack the return(s) value. 
A call node can produce multiple outputs + auto result_expr_sid = PackSid(result_expr); + PushArgs(result_expr, result_expr_sid, &args); + + GlobalVar global_var = call_lowered_props.lowered_func; + bool has_c_device_api_context = device_contexts_.count(global_var) != 0; + tir::Var device_context; + tir::Stmt func_call; + + switch (call_type_) { + case CallType::kUnpacked: { + // call_extern calling convention with optional context + if (has_c_device_api_context) { + device_context = device_contexts_.Get(global_var).value(); + args.push_back(device_context); + } + func_call = tir::Evaluate(AddCheckReturn( + tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(), args))); + break; + } + case CallType::kCPacked: { + if (has_c_device_api_context) { + device_context = device_contexts_.Get(global_var).value(); + args.push_back(device_context); + } else { + // NOTE: LowerTVMBuiltin expects some device_context placeholder. + args.push_back(tir::make_zero(DataType::Handle())); + } + func_call = tir::Evaluate( + tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_cpacked(), args)); + create_func_call_stmts.push_back(func_call); + break; + } + case CallType::kPacked: { + // call_packed does not accept a device context. + CHECK(!has_c_device_api_context) << "CallType::kPacked does not accept a device context"; + func_call = tir::Evaluate(AddCheckReturn( + tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::tvm_call_packed(), args))); + create_func_call_stmts.push_back(func_call); + break; + } + default: + ICHECK(false) << "Unknown CallType: " << call_type_; + } + + ICHECK(func_call.defined()) << "Must define func_call"; + + if (has_c_device_api_context) { + func_call = tir::SeqStmt(Array({ + GenerateDeviceHook(device_context, "Open"), + func_call, + GenerateDeviceHook(device_context, "Close"), + })); + } + + tir::Stmt body = tir::SeqStmt({func_call}); + stmts_.push_back(body); + } + + /*! + * \brief Copy a variable to the output. 
This function is mainly used in edge cases + * when we want to return an input or a parameter. + * TODO(giuseros): we should try to avoid unnecessary copy to the output, e.g., in a + * copy-on-write fashion. + */ + void CopyToOutput(PrimExpr out, PrimExpr in, bool pack_input, size_t size) { + // Define intermediate DLTensor to load/store the data + tir::Buffer tmp_read = + tir::decl_buffer({IntImm(DataType::UInt(64), size)}, DataType::UInt(8), "tmp_read"); + tir::Buffer tmp_write = + tir::decl_buffer({IntImm(DataType::UInt(64), size)}, DataType::UInt(8), "tmp_write"); + te::Var loop_idx("i", DataType::Int(32)); + auto retval_i = tir::BufferLoad(tmp_read, {loop_idx}); + // Copy the variable from the input to the output + tir::Stmt copy = tir::For( + loop_idx, 0, tir::make_const(DataType::Int(32, 1), size, Span()), tir::ForKind::kSerial, + tir::BufferStore(tmp_write, tir::Let(tmp_read->data, in, retval_i), {loop_idx})); + stmts_.push_back(tir::LetStmt(tmp_write->data, out, copy)); + } + + /*! + * \brief Generates a call to a given hook for all Devices found for C Device API + * \param hook Name of hook to generate statements for + * \return Statement with function calls for each device + */ + tir::Stmt GenerateAllDeviceHook(const String& hook) { + std::vector device_hooks; + for (const auto& it : devices_) { + const String& device_name = it.first; + const tir::Var& context = it.second; + Array sections = {"Device", device_name, hook}; + String device_hook_name = ToCFunctionStyle(PrefixName(sections)); + + tir::Evaluate device_hook( + AddCheckReturn(tvm::tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(), + {tvm::tir::StringImm(device_hook_name), context}))); + device_hooks.push_back(device_hook); + } + return tir::SeqStmt(device_hooks); + } + + /*! 
+ * \brief Generates a call to a given hook for a single Device function + * \param context Device context to call hook on + * \param hook Name of hook to generate statements for + * \return Statement with function call to Device API + */ + tir::Stmt GenerateDeviceHook(const tir::Var& context, const String& hook) { + const auto& it = std::find_if(std::begin(devices_), std::end(devices_), [&](const auto& it) { + return it.second->name_hint == context->name_hint; + }); + const String& device_name = (*it).first; + Array sections = {"Device", device_name, hook}; + String device_hook = ToCFunctionStyle(PrefixName(sections)); + + return tir::Evaluate( + AddCheckReturn(tir::Call(DataType::Int(32), tvm::tir::builtin::call_extern(), + {tvm::tir::StringImm(device_hook), context}))); + } + + /*! + * \brief Utility function to string together different arguments + */ + template + std::string MakeString(Args const&... args) { + std::ostringstream ss; + using List = int[]; + (void)List{0, ((void)(ss << args), 0)...}; + + return ss.str(); + } + + /*! + * \brief Access IO vars using the buffer vars and + * not the actual var. + */ + tir::Var GetBufferVarForIO(int index) { return main_buffer_map_[main_signature_[index]]->data; } + + /*! + * \brief Create tir::Var for input/output while updating the buffer_maps. + * \param expr The expression to evaluate. + * \param original_name The name of the tir::Var. + * \param use_unique_name Whether to generate a new unique name where a name conflicts. + */ + void CreateIOVar(const Expr& expr, const std::string& original_name, + bool use_unique_name = true) { + CreateIOVar(expr->checked_type(), original_name, use_unique_name); + } + + /*! + * \brief Create tir::Var for input/output while updating the buffer_maps. + * \param type The type of the input/output. + * \param original_name The name of the tir::Var. + * \param use_unique_name Whether to generate a new unique name where a name conflicts. 
+ */ + void CreateIOVar(const Type& type, const std::string& original_name, + bool use_unique_name = true) { + if (type->IsInstance()) { + TupleType tuple_type = Downcast(type); + for (unsigned i = 0; i < tuple_type->fields.size(); i++) { + CreateIOVar(tuple_type->fields[i], original_name); + } + } else { + std::string name = original_name; + if (use_unique_name) { + name = GetUniqueIOVarName(original_name); + } + tir::Var var = tir::Var(name, DataType::Handle()); + main_signature_.push_back(var); + auto tensor_type = type.as(); + ICHECK(tensor_type) << "Expected TensorType node but was " << type->GetTypeKey(); + DataType elem_type = tensor_type->dtype; + tir::Var buffer_var = + tir::Var(name + "_buffer_var", PointerType(PrimType(elem_type), "global")); + tir::Buffer buffer = tir::Buffer(buffer_var, elem_type, tensor_type->shape, {}, 0, + name + "_buffer", 16, 1, tir::BufferType::kDefault); + main_buffer_map_.Set(var, buffer); + } + } + + /*! + * \brief Create a unique name for I/O Var + */ + std::string GetUniqueIOVarName(std::string name) { + if (io_var_names_.find(name) == io_var_names_.end()) { + io_var_names_[name] = 1; + return name + std::to_string(io_var_names_[name] - 1); + } else { + io_var_names_[name] = io_var_names_[name] + 1; + return name + std::to_string(io_var_names_[name] - 1); + } + } + + /*! \brief list of input expressions (i.e., variable passed by the user) */ + std::vector input_vars_; + /*! \brief map of device context variables */ + Map devices_; + /*! \brief map of GlobalVars to C Device API contexts */ + Map device_contexts_; + /*! \brief input and output variables belonging to the main function signature */ + Array main_signature_; + /*! \brief map from the main function's input/output variables to their buffers */ + Map main_buffer_map_; + /*! \brief All available targets. */ + CompilationConfig config_; + /*! + * \brief The type of kernel call to be emitted. + * See CallType for more documentation. 
+ */ + CallType call_type_; + std::unordered_map + constant_map_; + /*! \brief plan memory of device result */ + StorageMap expr_storage_map_; + /*! \brief mapping sid -> tir::Var */ + std::unordered_map sids_table_; + /*! \brief the set of statements that make the program */ + std::vector stmts_; + /*! \brief the list of return sids (note that the function might return more than one output) */ + std::vector return_sid_; + /*! \brief This is a per-IO-var-name counter to aid in generating unique names */ + std::unordered_map io_var_names_; + /*! \brief A set of variables that are let bound. */ + std::unordered_set let_bound_vars_; +}; + +Pass AOTLowerMain(String mod_name, tvm::CompilationConfig config, CallType call_type) { + runtime::TypedPackedFunc pass_func = + [=](IRModule module, transform::PassContext ctx) { + return AOTMainLowerer(config, call_type).Lower(module, mod_name); + }; + + return tvm::transform::CreateModulePass(pass_func, 0, "AOTLowerMain", {"InferType"}); +} + +TVM_REGISTER_GLOBAL("relay.backend.aot.AOTLowerMain") + .set_body_typed([](const String& mod_name, const tvm::CompilationConfig& config, + int call_type) { + return AOTLowerMain(mod_name, config, static_cast(call_type)); + }); + +} // namespace aot +} // namespace backend +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/aot/aot_lower_main.h b/src/relay/backend/aot/aot_lower_main.h new file mode 100644 index 000000000000..8981e7d7434f --- /dev/null +++ b/src/relay/backend/aot/aot_lower_main.h @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_ +#define TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_ + +#include +#include + +#include +#include +#include + +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace backend { +namespace aot { + +using StorageMap = + std::unordered_map; + +/*! \brief Exposed for testing, part of the implementation of AOTLowerMain */ +std::tuple> CreateStorage(const Function& func); + +/*! \brief Lower the Relay main function into TIR for use with the AOT executor. + * + * This pass expects that all operators have already been lowered to TIR and + * so only Calls to 'call_lowered' are present in main. + * + * \param mod_name The name of the module. + * \param config The compilation config. + * \param call_type The call type to use when calling functions. 
+ */ +transform::Pass AOTLowerMain(String mod_name, tvm::CompilationConfig config, CallType call_type); + +} // namespace aot +} // namespace backend +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_AOT_AOT_LOWER_MAIN_H_ diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 5cf7a5563d19..51bcab527d1b 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -138,8 +138,20 @@ TVM_REGISTER_GLOBAL("relay.ir.StaticMemoryPlan") return StaticMemoryPlan(expr_to_storage_info); }); -// TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc, GetMemorySize in -// graph_plan_memory.cc +size_t DivRoundUp(size_t size, size_t word_size) { return (size + word_size - 1) / word_size; } + +size_t GetMemorySizeBytes(const Array& shape, const DataType& dtype) { + size_t size = 1; + for (IndexExpr dim : shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; + size *= static_cast(pval[0]); + } + size *= DivRoundUp(dtype.bits() * dtype.lanes(), 8); + return size; +} + int64_t CalculateRelayExprSizeBytes(const Type& expr_type) { if (expr_type->IsInstance()) { auto tuple_type = Downcast(expr_type); @@ -152,17 +164,7 @@ int64_t CalculateRelayExprSizeBytes(const Type& expr_type) { auto tensor_type = expr_type.as(); ICHECK(tensor_type); auto shape = tensor_type->shape; - int num_of_elements = 1; - for (const auto& dim_index_expr : shape) { - if (dim_index_expr->IsInstance()) { - num_of_elements *= dim_index_expr.as()->value; - } else { - // If shape is dynamic, we cannot calculate workspace in compile time. 
- num_of_elements = 0; - } - } - auto element_size = tensor_type->dtype.bytes(); - return element_size * num_of_elements; + return GetMemorySizeBytes(tensor_type->shape, tensor_type->dtype); } TVM_REGISTER_NODE_TYPE(FunctionInfoNode); diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 37ae9d803a35..6c65a081f156 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -59,6 +59,73 @@ class TECompiler; namespace backend { using Pass = tvm::transform::Pass; +/*! \brief Describes the type of kernel call emitted. */ +enum CallType { + /*! + * \brief Emit PackedFunc calls bound just-in-time using TVMBackend* functions. + * + * When this type is selected, assumes all operators must be called via TVMFuncCall. Given the + * implementation of TVMFuncCall in the C++ runtime, this in practice implies that those + * functions are of type TVMBackendPackedCFunc. + * + * The following code is emitted at call sites to call a function named `func`: + * void* func_ptr = TVMBackendGetFuncFromEnv("func"); + * TVMFuncCall(func_ptr, values, tcodes, num_args, ret_values, ret_tcodes) + * + * The arguments given to the tir::Call node are encoded into `values`, `tcodes`, and `num_args` + * by LowerTVMBuiltin TIR transform. + * + * If `resource_handle` is passed to `func`, it is determined by TVMFuncCall (often, + * `resource_handle` is registered with the C++ runtime to provide a `this` equivalent when + * `func` is implemented in C). + * + * Compatible with both C++ and C runtimes, implemented with the C runtime only. + */ + kPacked, // Emit tir.call_packed and wrap all arguments in DLTensor. + + /*! + * \brief Directly call a TVMBackendPackedCFunc named according to the tir::Call. + * + * When this type is selected, assumes all operators are implemented in functions of type + * `TVMBackendPackedCFunc` and should be called directly. 
That is, presumes at the time of + * downstream compilation that there is a symbol named after the 0th arg to tir::Call of + * type `TVMBackendPackedCFunc`. This situation should occur when target_host == target. + * + * The following code is emitted at call sites to call a function named `func`: + * func(values, tcodes, num_args, ret_values, ret_tcodes, resource_handle) + * + * The arguments given to the tir::Call node are encoded into `values`, `tcodes`, and `num_args` + * by LowerTVMBuiltin TIR transform. + * + * `resource_handle` is encoded as the final argument to the tir::Call node. In practice, it is + * always the device context parameter when not null. At present, the implementation does not + * support forwarding device context parameters to CPacked. + * + * Compatible with the C runtime and C++ runtime (so long as target_host == target). Implemented + * in the same scenarios. + */ + kCPacked, // Emit tir.call_cpacked and wrap all arguments in DLTensor. + + /*! \brief Directly call a function accepting the `data` arrays as args. + * + * When this type is selected, assumes all operators are implemented in C functions whose + * arguments are 1-to-1 with those in the tir::Call. DLTensor arguments are encoded as just the + * `data` parameters (i.e. no DLTensor object is passed along). + * + * The following code is emitted at call sites to a function named `func`: + * func(void* arg0, void* arg1, ..., void* argN) // no resource_handle + * -or- + * func(void* arg0, void* arg1, ..., void* argN, void* resource_handle) // with resource_handle + * + * `resource_handle` is encoded as the final argument to the tir::Call node. In practice, it is + * always the device context parameter when not null. + * + * Compatible with the C runtime and C++ runtime (so long as target_host == target). Implemented + * with the C runtime only. + */ + kUnpacked, // Emit tir.call_extern passing only the `data` part of DLTensors. +}; + /*! 
* \brief Structure that can be optionally used by the executor codegen */ @@ -207,6 +274,13 @@ class FunctionInfo : public ObjectRef { TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(FunctionInfo, ObjectRef, FunctionInfoNode); }; +/*! + * \brief Calculate the bytes of memory needed to hold a tensor of a given shape and data type. + * \param shape The shape of the tensor + * \param dtype The data type of the tensor + */ +size_t GetMemorySizeBytes(const Array& shape, const DataType& dtype); + /*! * \brief Calculate the storage required to store the type of relay.Expr * diff --git a/tests/cpp/relay/backend/aot/aot_lower_main_test.cc b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc new file mode 100644 index 000000000000..31166f1e6bb8 --- /dev/null +++ b/tests/cpp/relay/backend/aot/aot_lower_main_test.cc @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "../../../../../src/relay/backend/aot/aot_lower_main.h" + +#include +#include + +namespace tvm { +namespace relay { +namespace backend { +namespace aot { + +TEST(AOTLowerMain, ExprAllocatorSkipNestedFunc) { + constexpr const char* mod_text = R"( + #[version = "0.0.5"] + def @main(%x: Tensor[(10, 10), float32]) { + %0 = fn (%FunctionVar_01: Tensor[(10, 10), float32]) { + nn.relu(%FunctionVar_01) + }; + %0(%x) + } + )"; + IRModule mod = parser::ParseModule("string", mod_text, {}, {}); + auto host_target = tvm::Target("llvm"); + auto prim_target = tvm::Target(host_target, host_target); + auto ctxt = tvm::transform::PassContext::Current(); + auto config = tvm::CompilationConfig(ctxt, {prim_target}); + mod = tvm::relay::transform::PlanDevices(config)(mod); + mod = tvm::relay::transform::InferType()(mod); + + StorageMap storage_map; + std::vector return_sids; + auto func = Downcast(mod->Lookup("main")); + std::tie(storage_map, return_sids) = CreateStorage(func); + + auto nested_func = Downcast(Downcast(func->body)->op); + EXPECT_EQ(storage_map.find(nested_func->body), storage_map.end()); + EXPECT_EQ(storage_map.find(nested_func->params[0]), storage_map.end()); + EXPECT_NE(storage_map.find(func->body), storage_map.end()); + EXPECT_NE(storage_map.find(func->params[0]), storage_map.end()); +} + +} // namespace aot +} // namespace backend +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/aot/test_pass_aot_lower_main.py b/tests/python/relay/aot/test_pass_aot_lower_main.py new file mode 100644 index 000000000000..c583b287727a --- /dev/null +++ b/tests/python/relay/aot/test_pass_aot_lower_main.py @@ -0,0 +1,429 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=line-too-long,missing-class-docstring,missing-module-docstring,missing-function-docstring,no-self-argument,unused-argument,invalid-name +import numpy as np +import pytest + +import tvm +import tvm.testing +from tvm.script import tir as T +from tvm.relay.backend.aot import AOTLowerMain, CallType + + +def _make_const(dtype, shape): + return tvm.relay.const(np.zeros(shape).astype(dtype)) + + +def _make_consts(dtype, shapes): + return [_make_const(dtype, shape) for shape in shapes] + + +def _plan_devices(mod): + host_target = tvm.target.Target("llvm") + prim_target = tvm.target.Target("llvm", host=host_target) + ctxt = tvm.transform.PassContext() + config = tvm.target.make_compilation_config(ctxt, prim_target) + mod = tvm.relay.transform.PlanDevices(config)(mod) + mod = tvm.relay.transform.InferType()(mod) + return mod, config + + +def _assert_lowered_main(mod, main_func, call_type, print_script=False): + mod, config = _plan_devices(mod) + mod = AOTLowerMain("test_mod", config, call_type)(mod) + if print_script: + print(mod["__tvm_main__"].script()) + + assert mod["__tvm_main__"].script() == main_func.script() + + +def test_single_call_cpacked(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */; + 
call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_single_call_packed(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */; + call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_check_return(0, -1, T.tvm_call_packed("test_fused_add", a_buffer.data, output_buffer.data, dtype="int32"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.Packed) + + +def test_single_call_unpacked(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,) /* ty=(Tensor[(5, 7), 
float32],) */; + call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("test_fused_add", a_buffer.data, output_buffer.data, dtype="int32"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.Unpacked) + + +def test_constant(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a, meta[relay.Constant][0]) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */; + call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */ +} + """, + init_meta_table={"relay.Constant": _make_consts("float32", [(5, 7)])}, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "global_symbol": "test_mod___tvm_main__", "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + constant_0 = T.allocate_const([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "float32", [5, 7]) + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, constant_0, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) 
+ # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +# TODO(@mbaret) There seems to be a TVMScript round-trip bug causing this to fail +@pytest.mark.xfail() +def test_copy_to_output(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %a +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + tmp_read = T.buffer_var("uint8", "") + # buffer definition + tmp_read_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_read) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + tmp_write: T.Ptr[T.uint8] = output_buffer.data + tmp_write_1 = T.buffer_decl([T.uint64(140)], dtype="uint8", data=tmp_write) + for i in T.serial(140): + tmp_write_1[i] = T.let(tmp_read, a_buffer.data, tmp_read_1[i]) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_two_calls(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */; + %1 = call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */; + %2 = (%1,) /* ty=(Tensor[(5, 7), float32],) */; + call_lowered(@test_fused_add, %2) /* ty=Tensor[(5, 7), float32] */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = 
T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + sid_2 = T.allocate([140], "int8", "global.workspace") + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, sid_2, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add", sid_2, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_tuple_output(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) { (%x, %x) } + +def @main(%a: Tensor[(5, 7), float32]) -> (Tensor[(5, 7), float32], Tensor[(5, 7), float32]) { + %0 = (%a,) /* ty=(Tensor[(5, 7), float32],) */; + call_lowered(@test_fused_add, %0) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output0: T.handle, output1: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output0, output1]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output0_buffer = T.match_buffer(output0, [5, 7], dtype="float32", align=16) + output1_buffer = T.match_buffer(output1, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output0_buffer.data, output1_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_tuple_intermediate(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add_0(%x: Tensor[(5, 7), float32]) -> (Tensor[(5, 7), float32], Tensor[(5, 7), float32]) { (%x, %x) } +def @test_fused_add_1(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 
7), float32]) -> Tensor[(5, 7), float32] { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,); + %1 = call_lowered(@test_fused_add_0, %0); + %2 = (%1.0, %1.1); + call_lowered(@test_fused_add_1, %2) +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + sid_3 = T.allocate([140], "int8", "global.workspace") + sid_2 = T.allocate([140], "int8", "global.workspace") + T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_multi_input(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %x } + +def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a, %b) /* ty=(Tensor[(5, 7), float32], Tensor[(5, 7), float32]) */; + call_lowered(@test_fused_add, %0) /* ty=Tensor[(5, 7), float32] */ +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, b: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a, b], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + b_buffer = T.match_buffer(b, [5, 
7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, b_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_let_binding(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,); + let %v1 = call_lowered(@test_fused_add, %0); + %v1 +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_let_binding_branch(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add_0(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x } +def @test_fused_add_1(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,); + let %v0 = call_lowered(@test_fused_add_0, %0); + %1 = (%v0,); + let %v1 = call_lowered(@test_fused_add_0, %1); + %2 = (%v1,); + let %v2 = call_lowered(@test_fused_add_0, %2); + %3 = (%v1, %v2); + let %v3 = call_lowered(@test_fused_add_1, %3); + %v3 +} + """, + ) + + # fmt: off + @T.prim_func + def 
func(a: T.handle, output: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], dtype="float32", align=16) + # body + sid_3 = T.allocate([140], "int8", "global.workspace") + sid_2 = T.allocate([140], "int8", "global.workspace") + sid_1 = T.allocate([140], "int8", "global.workspace") + T.evaluate(T.tvm_call_cpacked("test_fused_add_0", a_buffer.data, sid_1, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_1, sid_2, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add_0", sid_2, sid_3, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add_1", sid_2, sid_3, output_buffer.data, T.reinterpret(T.uint64(0), dtype="handle"), dtype="int32")) + # fmt: on + + _assert_lowered_main(mod, func, CallType.CPacked) + + +def test_device_hooks(): + mod = tvm.parser.parse( + """ +#[version = "0.0.5"] +def @test_fused_add(%x: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { %x } + +def @main(%a: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { + %0 = (%a,); + %1 = call_lowered(@test_fused_add, %0); + %2 = (%1,); + call_lowered(@test_fused_add, %2) +} + """, + ) + + # fmt: off + @T.prim_func + def func(a: T.handle, output: T.handle, device_context_example_target_hook: T.handle) -> None: + # function attr dict + T.func_attr({"global_symbol": "test_mod___tvm_main__", "runner_function": True, "target": T.target({"kind":"llvm", "tag":"", "keys":["cpu"]}), "input_vars": [a], "output_vars": [output]}) + a_buffer = T.match_buffer(a, [5, 7], dtype="float32", align=16) + output_buffer = T.match_buffer(output, [5, 7], 
dtype="float32", align=16) + # body + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookActivate", device_context_example_target_hook, dtype="int32"), dtype="int32")) + with T.allocate([140], "int8", "global.workspace") as sid_2: + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookOpen", device_context_example_target_hook, dtype="int32"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add", a_buffer.data, sid_2, device_context_example_target_hook, dtype="int32")) + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookClose", device_context_example_target_hook, dtype="int32"), dtype="int32")) + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookOpen", device_context_example_target_hook, dtype="int32"), dtype="int32")) + T.evaluate(T.tvm_call_cpacked("test_fused_add", sid_2, output_buffer.data, device_context_example_target_hook, dtype="int32")) + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookClose", device_context_example_target_hook, dtype="int32"), dtype="int32")) + T.evaluate(T.tvm_check_return(0, -1, T.call_extern("TVMDeviceExampleTargetHookDeactivate", device_context_example_target_hook, dtype="int32"), dtype="int32")) + # fmt: on + + device_contexts = {} + for gv in mod.get_global_vars(): + device_contexts[gv] = "example_target_hook" + + mod = mod.with_attr("device_contexts", device_contexts) + + _assert_lowered_main(mod, func, CallType.CPacked) + + +if __name__ == "__main__": + tvm.testing.main() From 2aa0d1fbfcf4a31e343cc6852fdc4abd660c850a Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 14 Sep 2022 15:18:03 +0530 Subject: [PATCH 162/704] [OpenCLML] More ops and network coverage (#12762) Added operators pooling (avg, max), binary operators (add, subtract, multiply, min, max) and concat. 
Clip operator with min=0 and max=6 is remapped to relu6 to take advantage of CLML acceleration without sub-graphing it to the fallback path. Added new test cases for the operators listed above and also end-to-end network test cases for Resnet50 & InceptionV3. CLML supports FP16 arithmetic mode, which gives a significant performance boost over FP32. This PR enhances FP16 usage based on the operator datatype in the relay graph. Co-authored-by: Krishna Raju quic_kvegiraj@quicinc.com Co-authored-by: Shwetank Singh quic_shwesing@quicinc.com --- python/tvm/relay/op/contrib/clml.py | 35 +- src/relay/backend/contrib/clml/codegen.cc | 37 ++ src/runtime/contrib/clml/clml_runtime.cc | 315 +++++++++++++++--- .../contrib/test_clml/infrastructure.py | 28 +- .../python/contrib/test_clml/test_network.py | 139 ++++++-- tests/python/contrib/test_clml/test_ops.py | 83 ++++- 6 files changed, 529 insertions(+), 108 deletions(-) diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py index cacd10de2865..d253544d45d9 100644 --- a/python/tvm/relay/op/contrib/clml.py +++ b/python/tvm/relay/op/contrib/clml.py @@ -23,7 +23,7 @@ from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name -from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item +from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple from .register import register_pattern_table from ..strategy.generic import is_depthwise_conv2d @@ -135,6 +135,7 @@ def conv_pattern(): """Create a convolution pattern.""" pattern = is_op("nn.conv2d")(wildcard(), is_constant()) pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant())) + pattern = pattern.optional(lambda x: is_op("add")(x, is_constant())) pattern = pattern.optional( lambda x: is_op("nn.batch_norm")( x, is_constant(), is_constant(), is_constant(), is_constant() @@ -142,6 +143,7 @@ def conv_pattern(): ) pattern = pattern.optional(is_tuple_get_item) pattern = 
pattern.optional(is_op("nn.relu")) + pattern = pattern.optional(is_op("clip")) return pattern def batch_norm_pattern(): @@ -152,10 +154,24 @@ def batch_norm_pattern(): pattern = is_tuple_get_item(pattern) return pattern + def concat_pattern(): + """Create a concat pattern. + + Returns + ------- + pattern : dataflow_pattern.AltPattern + Denotes the concat pattern. + """ + pattern = is_tuple(None) + pattern = is_op("concatenate")(pattern) + + return pattern + def dense_pattern(): """Create a dense pattern.""" pattern = is_op("nn.dense")(wildcard(), is_constant()) pattern = pattern.optional(lambda x: is_op("add")(x, is_constant())) + pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant())) return pattern def pad_pattern(): @@ -172,6 +188,13 @@ def check_conv(extract): call = call.args[0] if isinstance(call, tvm.relay.expr.TupleGetItem): call = call.tuple_value + elif call.op.name == "clip": + if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0: + return False + call = call.args[0] + if isinstance(call, tvm.relay.expr.TupleGetItem): + call = call.tuple_value + while call.op.name != "nn.conv2d": call = call.args[0] attrs, args = call.attrs, call.args @@ -194,6 +217,7 @@ def check_conv(extract): ("clml.conv2d", conv_pattern(), check_conv), ("clml.dense", dense_pattern()), ("clml.pad", pad_pattern()), + ("clml.concat", concat_pattern()), ("clml.batch_norm", batch_norm_pattern()), ] @@ -207,11 +231,18 @@ def _func_wrapper(expr): _register_external_op_helper("clip") -_register_external_op_helper("relu") +_register_external_op_helper("nn.relu") _register_external_op_helper("nn.global_avg_pool2d") _register_external_op_helper("nn.global_max_pool2d") +_register_external_op_helper("nn.avg_pool2d") +_register_external_op_helper("nn.max_pool2d") _register_external_op_helper("nn.softmax") _register_external_op_helper("reshape") +_register_external_op_helper("add") +_register_external_op_helper("subtract") +_register_external_op_helper("multiply") 
+_register_external_op_helper("minimum") +_register_external_op_helper("maximum") class OpAttrContext(object): diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc index fa082a423d78..b89f05e17857 100644 --- a/src/relay/backend/contrib/clml/codegen.cc +++ b/src/relay/backend/contrib/clml/codegen.cc @@ -91,6 +91,8 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer { json_node = CreateDenseJSONNode(cn); } else if (name == "clml.pad") { json_node = CreatePadJSONNode(cn); + } else if (name == "clml.concat") { + json_node = CreateConcatJSONNode(cn); } else { LOG(FATAL) << "Unrecognized CLML pattern: " << name; } @@ -148,6 +150,15 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer { } else { current_call = current_call->args[0].as(); } + } else if (backend::IsOp(current_call, "clip")) { + nodes.activation = current_call; + nodes.act_type = "relu6"; + if (current_call->args[0].as()) { + auto tuple_item = current_call->args[0].as(); + current_call = tuple_item->tuple.as(); + } else { + current_call = current_call->args[0].as(); + } } if (backend::IsOp(current_call, "nn.batch_norm")) { nodes.bn = current_call; @@ -279,6 +290,32 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer { return json_node; } + /*! + * \brief Create a JSON representation of a Concat operator. + * + * \param cn The call to be represented. + * \return A JSON representation of a specific operator. 
+ */ + std::shared_ptr CreateConcatJSONNode(const CallNode* cn) { + const auto* fn = cn->op.as(); + ICHECK(fn); + const auto* concat = fn->body.as(); + + ICHECK(backend::IsOp(concat, "concatenate")); + const auto* concat_op = concat->op.as(); + ICHECK(concat_op); + const std::string name = concat_op->name; + + std::vector inputs; + for (auto arg : cn->args) { + inputs.push_back(VisitExpr(arg)[0]); + } + + auto json_node = std::make_shared(name, "kernel", inputs, 1); + SetCallNodeAttribute(json_node, concat); + return json_node; + } + /*! * \brief Create a JSON representation of a Dense operator. * diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index da41442ef91d..cdc3b9a7b51c 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -335,13 +335,15 @@ class CLMLRuntime : public JSONRuntimeBase { size_t nid; for (nid = 0; nid < nodes_.size(); ++nid) { const auto& node = nodes_[nid]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); if (node.GetOpType() == "input") { - auto clml_input = MakeCLMLTensorFromJSONNode(node); + auto clml_input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)}); this->layer_.inputs.push_back(clml_input); // Input copy placeholder Tensor this->layer_.in_placeholder.push_back( - MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM)); + MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype)); } else if (node.GetOpType() == "kernel") { auto op_name = node.GetOpName(); if ("nn.conv2d" == op_name) { @@ -364,6 +366,11 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreateBatchNormLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == 
op_name || + "nn.l2_pool2d" == op_name) { + auto out = CreatePoolingLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) { auto out = CreateGlobalPoolingLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); @@ -372,6 +379,10 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreateReshapeLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("concatenate" == op_name) { + auto out = CreateConcatLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else if ("nn.dense" == op_name) { auto out = CreateDenseLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); @@ -388,6 +399,11 @@ class CLMLRuntime : public JSONRuntimeBase { auto out = CreateClipLayer(&layer_, node); this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); this->layer_.func_outs.push_back(out); + } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name || + "minimum" == op_name || "maximum" == op_name) { + auto out = CreateBinaryLayer(&layer_, node); + this->layer_.storage_map.insert({nid, std::make_pair(out, node)}); + this->layer_.func_outs.push_back(out); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -396,10 +412,14 @@ class CLMLRuntime : public JSONRuntimeBase { LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType(); } } - if (nid > 0) { - this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first); + + for (size_t i = 0; i < outputs_.size(); ++i) { + nid = outputs_[i].id_; + DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + 
this->layer_.outputs.push_back(this->layer_.storage_map[nid].first); this->layer_.out_placeholder.push_back( - MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM)); + MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype)); } // ALlocate device memories and initialize the params if any cl_int result = 0; @@ -558,6 +578,20 @@ class CLMLRuntime : public JSONRuntimeBase { } } + cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type, + const cl_channel_type& acc_type = CL_FLOAT) { + if (data_type == CL_FLOAT && acc_type == CL_FLOAT) { + return CL_ARITHMETIC_MODE_FP32_QCOM; + } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) { + return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM; + } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) { + return CL_ARITHMETIC_MODE_FP16_QCOM; + } else { + LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime"; + return CL_ARITHMETIC_MODE_FP32_QCOM; + } + } + std::shared_ptr MakeCLMLTensor( const JSONGraphNode& tensor_rep, void* data, std::vector c_shape, cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) { @@ -634,6 +668,9 @@ class CLMLRuntime : public JSONRuntimeBase { std::vector strides = node.GetAttr>("strides"); std::vector dilation = node.GetAttr>("dilation"); std::vector clml_padding = GetVectorValues(padding); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); if (!node.HasAttr("padding")) { clml_padding.resize(4); std::fill(clml_padding.begin(), clml_padding.end(), 0); @@ -668,7 +705,7 @@ class CLMLRuntime : public JSONRuntimeBase { has_act = true; } cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_arithmetic_mode}; // Collect inputs and outputs, handling nn.conv2d. 
std::vector inputs = node.GetInputs(); @@ -680,15 +717,15 @@ class CLMLRuntime : public JSONRuntimeBase { has_bias = (num_inputs == 3) || (num_inputs == 7); has_bn = (num_inputs == 6) || (num_inputs == 7); // Input - auto input = MakeCLMLTensorFromJSONEntry(inputs[0]); - + auto input = + MakeCLMLTensorFromJSONEntry(inputs[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); // Weight - auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]); - + auto weight = + MakeCLMLTensorFromJSONEntry(inputs[1], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); // Bias auto bias = std::make_shared(); if (has_bias) { - bias = MakeCLMLTensorFromJSONEntry(inputs[2]); + bias = MakeCLMLTensorFromJSONEntry(inputs[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); } else { cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; @@ -698,7 +735,7 @@ class CLMLRuntime : public JSONRuntimeBase { bias->tensor = layer_.unusedTensor; } // Output - auto output = MakeCLMLTensorFromJSONNode(node); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); cl_ml_op_convolution_desc_qcom conv_desc{mode, groups, 4, @@ -707,7 +744,7 @@ class CLMLRuntime : public JSONRuntimeBase { {clml_strides[0], clml_strides[1]}, {clml_dilation[0], clml_dilation[1]}, 0, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_arithmetic_mode}; cl_ml_op_qcom op = NULL; if (!has_bn) { @@ -734,13 +771,16 @@ class CLMLRuntime : public JSONRuntimeBase { auto bn_var = std::make_shared(); auto bn_scale = std::make_shared(); auto bn_bias = std::make_shared(); - bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape); - bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape); - bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape); - bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape); - - cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + bn_scale = 
MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + + cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; if (!has_act) { result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM( workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor, @@ -772,11 +812,15 @@ class CLMLRuntime : public JSONRuntimeBase { cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); - auto output = MakeCLMLTensorFromJSONNode(node); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_arithmetic_mode}; cl_ml_tensor_desc_qcom desc = {}; desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; @@ -805,7 +849,11 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = 
MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); int axis = std::stoi(node.GetAttr>("axis")[0]); auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]); std::vector bn_shape = {1, 1, 1, 1}; @@ -814,15 +862,18 @@ class CLMLRuntime : public JSONRuntimeBase { auto bn_var = std::make_shared(); auto bn_scale = std::make_shared(); auto bn_bias = std::make_shared(); - bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape); - bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape); - bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape); - bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape); + bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto output = MakeCLMLTensorFromJSONNode(node); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM( workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor, @@ -834,6 +885,61 @@ class CLMLRuntime : public JSONRuntimeBase { return output; } + /*! + * \brief Create a creating pooling layer. + * + * \note Currently global_max_pool2d and global_avg_pool2d are supported. + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function. 
+ * \param node The JSON representation of the operator. + */ + std::shared_ptr CreatePoolingLayer(CachedLayer* layer, + const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); + + std::vector windows = node.GetAttr>("pool_size"); + std::vector strides = node.GetAttr>("strides"); + std::vector padding = node.GetAttr>("padding"); + std::vector clml_window = GetVectorValues(windows); + std::vector clml_stride = GetVectorValues(strides); + std::vector clml_padding = GetVectorValues(padding); + + cl_ml_op_pooling_desc_qcom pool_desc = { + node.GetOpName() == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM + : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM, + 4, // reserved + {clml_padding[0], clml_padding[1]}, + {clml_padding[2], clml_padding[3]}, + {clml_stride[0], clml_stride[1]}, + {clml_window[0], clml_window[1]}, + CL_PROPAGATE_NAN_QCOM, + cl_arithmetic_mode, + }; + + cl_ml_tensor_desc_qcom desc = {}; + cl_ml_tensor_qcom unusedTensor = NULL; + desc.num_dimensions = CL_TENSOR_UNUSED_QCOM; + result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &unusedTensor); + ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result; + + result = + h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, input->tensor, + unusedTensor, output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result; + + layer_.func_ins.push_back(input); + layer->function.push_back(op); + return output; + } + /*! * \brief Create a global pooling layer. 
* @@ -846,8 +952,12 @@ class CLMLRuntime : public JSONRuntimeBase { CachedLayer* layer, const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); - auto output = MakeCLMLTensorFromJSONNode(node); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); cl_ml_op_pooling_desc_qcom pool_desc = { node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM @@ -858,7 +968,7 @@ class CLMLRuntime : public JSONRuntimeBase { {1, 1}, {in_dims.w, in_dims.h}, CL_PROPAGATE_NAN_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM, + cl_arithmetic_mode, }; cl_ml_tensor_desc_qcom desc = {}; @@ -887,14 +997,17 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); - auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr, + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr, {out_dims.n, out_dims.c, 1, 1}); cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, - CL_SOFTMAX_MODE_INSTANCE_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + 
CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc, input->tensor, output->tensor, &op, tuning_cache); @@ -915,8 +1028,12 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); - auto output = MakeCLMLTensorFromJSONNode(node); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); std::string pad_mode = node.GetAttr>("pad_mode")[0]; std::vector padding = node.GetAttr>("pad_width"); @@ -936,7 +1053,7 @@ class CLMLRuntime : public JSONRuntimeBase { clml_pad_mode, {0, 0}, {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0}, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor, output->tensor, &op, tuning_cache); @@ -957,8 +1074,11 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); - auto output = MakeCLMLTensorFromJSONNode(node); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor, output->tensor, &op, tuning_cache); @@ -969,6 +1089,42 @@ class 
CLMLRuntime : public JSONRuntimeBase { return output; } + /*! + * \brief Create a concat layer. + * + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function. + * \param node The JSON representation of the operator. + */ + std::shared_ptr CreateConcatLayer(CachedLayer* layer, + const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + std::vector input_ = node.GetInputs(); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + int inputSize = input_.size(); + int axis = std::stoi(node.GetAttr>("axis")[0]); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize]; + for (int i = 0; i < inputSize; i++) { + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + concatInputs[i] = input->tensor; + } + cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)inputSize, cl_arithmetic_mode}; + + result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs, + output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result; + + layer->function.push_back(op); + + delete[] concatInputs; + return output; + } + /*! * \brief Create a dense layer. 
* @@ -980,21 +1136,27 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto inp_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {1, inp_dims.c, 1, 1}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]); bool has_bias = node.GetInputs().size() == 3 ? true : false; - - auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c}); + auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); auto bias = std::make_shared(); if (has_bias) { auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]); - bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1}); + bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); } cl_ml_op_fully_connected_desc_qcom fc_desc = {1, CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_arithmetic_mode}; + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); - auto output = MakeCLMLTensorFromJSONNode(node); if (has_bias) { result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM( workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor, @@ -1021,15 +1183,17 @@ class CLMLRuntime : public JSONRuntimeBase { const JSONGraphNode& node) { cl_int result = 0; cl_ml_op_qcom op = NULL; - auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]); - auto output = MakeCLMLTensorFromJSONNode(node); + DLDataType tvm_dtype = 
node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, + cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); cl_float a_max = std::stof(node.GetAttr>("a_max")[0]); cl_float a_min = std::stof(node.GetAttr>("a_min")[0]); - cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM, - {{a_max}, CL_FLOAT}, - {{a_min}, CL_FLOAT}, - CL_ARITHMETIC_MODE_FP32_QCOM}; + cl_ml_op_clip_desc_qcom clip_desc = { + CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode}; result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor, output->tensor, &op, tuning_cache); @@ -1040,6 +1204,47 @@ class CLMLRuntime : public JSONRuntimeBase { return output; } + /*! + * \brief Create a Binary layer. + * + * \param layer The CLML layer to build. Containing inputs, outputs and the CLML output. + * \param node The JSON representation of the operator. 
+ */ + std::shared_ptr CreateBinaryLayer(CachedLayer* layer, + const JSONGraphNode& node) { + cl_int result = 0; + cl_ml_op_qcom op = NULL; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype); + cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype); + auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {}, + CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype); + std::string op_name = node.GetOpName(); + cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM; + if (op_name == "subtract") + binary_op = CL_TENSOR_OP_SUB_QCOM; + else if (op_name == "multiply") + binary_op = CL_TENSOR_OP_MUL_QCOM; + else if (op_name == "minimum") + binary_op = CL_TENSOR_OP_MIN_QCOM; + else if (op_name == "maximum") + binary_op = CL_TENSOR_OP_MAX_QCOM; + cl_ml_op_binary_desc_qcom add_desc = { + binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode}; + + result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->context, 0, &add_desc, input_a->tensor, + input_b->tensor, output->tensor, &op, tuning_cache); + ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result; + + layer_.func_ins.push_back(input_a); + layer_.func_ins.push_back(input_b); + layer->function.push_back(op); + return output; + } + /*! * \brief The network layers represented by acl functions. * \note Currently only supports a single layer. 
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py index 0cf76079e8fb..08b11525ecd2 100644 --- a/tests/python/contrib/test_clml/infrastructure.py +++ b/tests/python/contrib/test_clml/infrastructure.py @@ -29,6 +29,7 @@ from tvm.contrib import graph_executor from tvm.relay.op.contrib import clml from tvm.contrib import utils +from tvm import autotvm from tvm.autotvm.measure import request_remote from tvm.relay.expr_functor import ExprMutator, Call @@ -144,35 +145,28 @@ def skip_codegen_test(): return True -def build_module(mod, target, target_host, params=None, enable_clml=True): +def build_module(mod, target, target_host, params=None, enable_clml=True, tune_log=""): """Build module with option to build for CLML.""" if isinstance(mod, tvm.relay.expr.Call): mod = tvm.IRModule.from_expr(mod) - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - if enable_clml: - mod = clml.partition_for_clml(mod, params) - relay.backend.te_compiler.get().clear() - # print("Build Mod:", mod) - return relay.build(mod, target=target, target_host=target_host, params=params) + with autotvm.apply_history_best(tune_log): + with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): + if enable_clml: + mod = clml.partition_for_clml(mod, params) + relay.backend.te_compiler.get().clear() + return relay.build(mod, target=target, target_host=target_host, params=params) def build_and_run( - mod, - inputs, - outputs, - params, - device, - enable_clml=True, - no_runs=1, - config=None, + mod, inputs, outputs, params, device, enable_clml=True, no_runs=1, config=None, tune_log="" ): """Build and run the relay module.""" if config is None: config = {} try: - libm = build_module(mod, device.target, device.target_host, params, enable_clml) + libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log) clml_modules = extract_clml_modules(libm) for mod in clml_modules: @@ 
-198,7 +192,7 @@ def build_and_run( for _ in range(no_runs): gen_module.run() out.append([gen_module.get_output(i) for i in range(outputs)]) - time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50) + time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1) cost = time_f().mean print("%g secs/iteration\n" % cost) return out diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py index 405f5782ff2e..95f3a45baf78 100644 --- a/tests/python/contrib/test_clml/test_network.py +++ b/tests/python/contrib/test_clml/test_network.py @@ -25,20 +25,13 @@ from test_clml.infrastructure import skip_runtime_test, build_and_run, Device -def _build_and_run_network(mod, params, inputs, data, device, atol, rtol): +def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, tvm_log=""): """Helper function to build and run a network.""" outputs = [] for clml in [True, False]: outputs.append( - build_and_run( - mod, - data, - 1, - params, - device, - enable_clml=clml, - )[0] + build_and_run(mod, data, 1, params, device, enable_clml=clml, tune_log=tvm_log)[0][0] ) return outputs @@ -55,11 +48,7 @@ def _get_keras_model(keras_model, inputs_dict, data): def get_bottom_top_model(model, layer_name): layer = model.get_layer(layer_name) bottom_input = model.layers[0].input - bottom_output = bottom_input - for layer in model.layers: - bottom_output = layer(bottom_output) - if layer.name == layer_name: - break + bottom_output = layer.output bottom_model = Model(bottom_input, bottom_output) return bottom_model @@ -81,6 +70,9 @@ def test_mobilenet(): def get_model(): from tensorflow.keras.applications import MobileNet + import tensorflow as tf + + tf.keras.backend.clear_session() mobilenet = MobileNet( include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000 @@ -106,32 +98,113 @@ def get_model(): ) # test - print("OpenCL:", outputs[0][0].asnumpy().shape) - print("CLML:", 
outputs[1][0].asnumpy().shape) + print("OpenCL:", outputs[0].asnumpy().shape) + print("CLML:", outputs[1].asnumpy().shape) - opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten() - clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten() + opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() + clml_sort = np.argsort(outputs[0].asnumpy()).flatten() tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5) -""" - tvm.testing.assert_allclose( - ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5) - print("OpenCL to Keras looks good") - tvm.testing.assert_allclose( - outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5) - print("OpenCL to CLML looks good") - exit(0) +def test_inception_v3(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + dtype = "float16" + + def get_model(): + from tensorflow.keras.applications import InceptionV3 + import tensorflow as tf + + tf.keras.backend.clear_session() + + inceptionV3 = InceptionV3( + include_top=True, weights=None, input_shape=(299, 299, 3), classes=1000 + ) + inputs = {inceptionV3.input_names[0]: ((1, 3, 299, 299), "float16")} + + data = {} + np.random.seed(0) + for name, (shape, dtype) in inputs.items(): + if dtype == "uint8": + low, high = 0, 1 + else: + low, high = -2, 1 + data[name] = np.random.uniform(low, high, shape).astype(dtype) + + mod, params, ref_outputs = _get_keras_model(inceptionV3, inputs, data) + return mod, params, inputs, data, ref_outputs + + mod, params, inputs, input_data, ref_outputs = get_model() + outputs = _build_and_run_network( + mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5 + ) + + opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() + clml_sort = np.argsort(outputs[0].asnumpy()).flatten() + + tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5) + + +def test_resnet50v2(): + Device.load("test_config.json") + + if skip_runtime_test(): + 
return + + device = Device() + dtype = "float16" + + def get_model(): + from tensorflow.keras.applications import ResNet50V2 + import tensorflow as tf + + tf.keras.backend.clear_session() - tvm.testing.assert_allclose( - ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5) - print("OpenCL to Keras looks good") - tvm.testing.assert_allclose( - outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5) - print("OpenCL to CLML looks good") -""" + model = ResNet50V2(include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000) + inputs_dict = {model.input_names[0]: ((1, 3, 224, 224), "float32")} + + data = {} + np.random.seed(0) + + for name, (shape, dtype) in inputs_dict.items(): + if dtype == "uint8": + low, high = 0, 1 + else: + low, high = -1, 1 + data[name] = np.random.uniform(low, high, shape).astype(dtype) + + """Convert Keras graph to relay.""" + inputs = {} + for name, (shape, _) in inputs_dict.items(): + inputs[model.input_names[0]] = shape + + ref_outputs = model.predict(data["input_1"].transpose(0, 2, 3, 1)) + + mod, params = relay.frontend.from_keras(model, inputs, layout="NCHW") + + return mod, params, inputs, data, ref_outputs + + mod, params, inputs, input_data, ref_outputs = get_model() + outputs = _build_and_run_network( + mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5 + ) + + # test + print("OpenCL:", outputs[0].asnumpy().shape) + print("CLML:", outputs[1].asnumpy().shape) + + opencl_sort = np.argsort(outputs[1].asnumpy()).flatten() + clml_sort = np.argsort(outputs[0].asnumpy()).flatten() + + tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5) if __name__ == "__main__": test_mobilenet() + test_resnet50v2() + test_inception_v3() diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py index 13f49d152714..d14a5ec6e90d 100644 --- a/tests/python/contrib/test_clml/test_ops.py +++ 
b/tests/python/contrib/test_clml/test_ops.py @@ -211,6 +211,87 @@ def test_batchnorm(): ) +def test_concat(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + dtype = "float16" + in_shape_1 = (1, 16, 16, 16) + in_shape_2 = (1, 16, 16, 16) + a = relay.var("input_1", shape=in_shape_1, dtype=dtype) + b = relay.var("input_2", shape=in_shape_2, dtype=dtype) + low, high = -1, 1 + inputs = { + "input_1": tvm.nd.array(np.random.uniform(-1, 1, in_shape_1).astype(dtype)), + "input_2": tvm.nd.array(np.random.uniform(-1, 1, in_shape_2).astype(dtype)), + } + + params = {} + func = relay.concatenate((a, b), axis=1) + mod = IRModule.from_expr(func) + + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + +def test_avgpool(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + dtype = "float16" + trials = [ + # input size pool_size stride paading + [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"], + [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"], + [(1, 288, 35, 35), (3, 3), (2, 2), (0, 0, 0, 0), "max"], + [(1, 768, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"], + [(1, 2048, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"], + [(1, 192, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"], + [(1, 256, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"], + [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"], + [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"], + [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"], + ] + params = {} + for ( + input_shape, + pool_size, + stride, + padding, + pooling_type, + ) in trials: + a = relay.var("input_1", shape=input_shape, dtype=dtype) + input_arr = tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype)) + inputs = 
{ + "input_1": input_arr, + } + + if pooling_type == "max": + func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, padding=padding) + else: + func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding) + mod = IRModule.from_expr(func) + + opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0] + clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0] + + tvm.testing.assert_allclose( + clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3 + ) + + if __name__ == "__main__": test_conv2d() - test_batchnorm() + # test_batchnorm() + test_avgpool() + test_concat() From a40849342d250bd585e19434e4a2473fcf978bcb Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 14 Sep 2022 09:23:51 -0500 Subject: [PATCH 163/704] [Relay][TE] Use Relay parameter name to generated TE tensor name (#10516) * [Relay][TE] Use Relay parameter name to generated TE tensor name Previously, the TE placeholders representing relay function parameters were all named `"placeholder"`, which could be difficult to follow when debugging larger functions. --- .../ci_logs/resnet-18-NHWC-B1-cuda.json | 50 +++++++++---------- python/tvm/auto_scheduler/measure.py | 17 +++++-- .../tvm/auto_scheduler/relay_integration.py | 5 +- .../contrib/ethosu/tir_to_cs_translator.py | 2 +- src/relay/backend/te_compiler_cache.cc | 9 ++-- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json index 7cb3a67067b0..c8b9f41a5ca9 100644 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json +++ b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json @@ -1,26 +1,24 @@ -# Provide valid schedules for resnet-18 on GPU. -# This is used to run the tutorial on the documentation web server. 
-{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.5"} -{"i": [["[\"9847f8cc0b305137f49f2c5c0c8ab25d\", 1, 512, 1000, 512, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.5"} -{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 512, 1, 1, 1, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.5"} -{"i": 
[["[\"ad6cecbf5d85cb1cda3c2bb7af170211\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], 
"r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.5"} -{"i": [["[\"3a69f9fbc63760d99e36b4c17b3bfc57\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, 
"auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.5"} -{"i": [["[\"d730bcd28f0920f6b97245e2a11bd8d6\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 
11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.5"} -{"i": [["[\"f3b6c10fcc6ce01ff01add933e4d21e9\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, 
"auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.5"} -{"i": [["[\"b8b52b9be9df6102466a22a014c44c1f\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 
4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.5"} -{"i": [["[\"d374e472bd9d8164892b9e28a0a8cb59\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], 
["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.5"} -{"i": [["[\"c4500b4e2fd04e695c32d2f31bbdc14a\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], 
["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.5"} -{"i": [["[\"e4cdf917b876dbdd64488c3818d9c141\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, 
[2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.5"} -{"i": [["[\"dac19035dd5fe9424ee8617421b9c817\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 
1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], 
["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.5"} -{"i": [["[\"1e3c4211ffd2f2db91078ae4d04b779d\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 
4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.5"} -{"i": [["[\"b818b53148cd450f86569dfc3e04cb8a\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 
8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.5"} -{"i": [["[\"3ea73fb9b0364374730d09e068821f95\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], 
["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.5"} -{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 
1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.5"} -{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 
3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 
1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], 
["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.5"} +{"i": [["[\"f19692ed81d032b1697c08adee62f9a5\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], 
["SP", 8, 4, 128, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 2, 2], 1], ["SP", 6, 10, 196, [1, 49, 2, 1], 1], ["SP", 6, 15, 128, [4, 2, 1, 1], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [4], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000186843], 0, 0.965096, 1650980656], "v": "v0.6"} +{"i": [["[\"2d10de6646307f0e3e5cf4b31c20e69b\", [1, 56, 56, 64], [1, 1, 64, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 
1], ["SP", 3, 5, 56, [1, 2, 2, 1], 1], ["SP", 3, 10, 56, [1, 8, 1, 7], 1], ["SP", 3, 15, 64, [1, 16, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [32, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 4, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 14, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.37742e-05], 0, 1.18571, 1650980663], "v": "v0.6"} +{"i": [["[\"a3df19e5b88592ef5a9ce584a1ca3010\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 1, 1, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [2, 1, 2, 1], 1], ["SP", 6, 10, 16, [1, 8, 2, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 2], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 
3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [2], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000317285], 0, 0.910312, 1650980674], "v": "v0.6"} +{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 56, 56, 64], [1, 1, 64, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 2, 2], 1], ["SP", 3, 10, 28, [2, 7, 1, 2], 1], ["SP", 3, 15, 128, [2, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 
1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 21, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000229624], 0, 0.97359, 1650980681], "v": "v0.6"} +{"i": [["[\"0bcf718c0e6566bcd6c3b1437a3b6291\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 2, 1], 1], ["SP", 6, 10, 196, [1, 7, 4, 1], 1], ["SP", 6, 15, 128, [1, 8, 2, 1], 1], ["SP", 6, 20, 128, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], 
["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 4, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000142796], 0, 0.851287, 1650980693], "v": "v0.6"} +{"i": [["[\"1097323f3970e5c881ad3a0028ca79cb\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 49, [7, 1, 1, 1], 1], ["SP", 6, 15, 256, [1, 64, 1, 2], 1], ["SP", 6, 20, 256, [1, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 
5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 4, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 2, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000289122], 0, 1.35723, 1650980705], "v": "v0.6"} +{"i": [["[\"d78e8eb6021c4cdda0ad7775d10f751a\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 16, [4, 1, 1, 2], 1], ["SP", 6, 15, 512, [1, 8, 1, 2], 1], ["SP", 6, 20, 512, [2, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [64], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 
1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000182145], 0, 0.954184, 1650980716], "v": "v0.6"} +{"i": [["[\"7c2a4f1f432f81c44985590780dfb52d\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 2], 1], ["SP", 6, 5, 6, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 98, 1], 1], ["SP", 6, 15, 64, [2, 16, 1, 1], 1], ["SP", 6, 20, 64, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 
11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 392, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00029727], 0, 2.54044, 1650980730], "v": "v0.6"} +{"i": [["[\"64b7ce5264a64cb340d78b444b0325e6\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 7], 1], ["SP", 6, 15, 256, [4, 16, 2, 1], 1], ["SP", 6, 20, 256, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 
17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [8], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000645288], 0, 3.306, 1650980745], "v": "v0.6"} +{"i": [["[\"be3babb9a46e32f66b717a3e2a2d522c\", [1, 7, 7, 512], [1, 1, 1, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.49558e-06], 0, 0.880265, 1650980753], "v": "v0.6"} +{"i": [["[\"7d79c516e212fe1d73f5dbb90eaca2cf\", [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 4, 1, 1000, [20], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["AN", 4, 0, 5], ["AN", 1, 0, 6], ["PR", 1, 0, "auto_unroll_max_step$0"], ["PR", 3, 0, "auto_unroll_max_step$16"]]]], "r": [[1.66218e-05], 0, 1.00389, 1650980756], "v": "v0.6"} +{"i": [["[\"40b1cf1fd37b0ef111b3cc0247302508\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 
32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [8], 1], ["SP", 8, 4, 512, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [1, 4, 1, 4], 1], ["SP", 6, 15, 512, [1, 128, 1, 1], 1], ["SP", 6, 20, 512, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00019327], 0, 0.828601, 1650980768], "v": "v0.6"} +{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 28, 28, 128], [1, 1, 128, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 
64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 2, 1, 1], 1], ["SP", 3, 10, 14, [1, 1, 2, 1], 1], ["SP", 3, 15, 256, [4, 8, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [2, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 32, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 24, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.57402e-05], 0, 1.28219, 1650980774], "v": "v0.6"} +{"i": [["[\"25577781e50c611c2e45e73c1cb3a6ca\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 128, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 196, [7, 7, 2, 2], 1], ["SP", 6, 15, 128, [1, 4, 2, 1], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], 
["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 128, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000287883], 0, 1.48484, 1650980787], "v": "v0.6"} +{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 28, 28, 128], [3, 3, 128, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 2, 1], 1], ["SP", 3, 10, 14, [1, 1, 2, 7], 1], ["SP", 3, 15, 256, [1, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 128, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 
6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 648, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000155192], 0, 0.945575, 1650980794], "v": "v0.6"} +{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 14, 14, 256], [3, 3, 256, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 7, 1, 1], 1], ["SP", 3, 10, 7, [7, 1, 1, 1], 1], ["SP", 3, 15, 512, [1, 16, 2, 8], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 256, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00083305], 0, 2.13994, 1650980802], "v": "v0.6"} +{"i": [["[\"6c4f6234946e16bcf9e48bdf289f9200\", 
[1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 64, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [2, 1, 3, 1], 1], ["SP", 6, 5, 6, [1, 6, 1, 1], 1], ["SP", 6, 10, 196, [1, 28, 1, 1], 1], ["SP", 6, 15, 64, [1, 1, 1, 4], 1], ["SP", 6, 20, 64, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 96, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 24, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000112836], 0, 1.67377, 
1650980816], "v": "v0.6"} +{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 224, 224, 3], [7, 7, 3, 64], [1, 1, 1, 64], [1, 112, 112, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 28, 1], 1], ["SP", 3, 10, 112, [7, 1, 1, 1], 1], ["SP", 3, 15, 64, [1, 32, 1, 1], 1], ["SP", 3, 20, 7, [1, 7], 1], ["SP", 3, 23, 7, [7, 1], 1], ["SP", 3, 26, 3, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 49, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 91, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000303402], 0, 1.28361, 1650980824], "v": "v0.6"} +{"i": [["[\"10b8215aaf2e14d47d40b4093e6f41a0\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [7], 1], ["SP", 8, 4, 64, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 1], 1], ["SP", 6, 10, 196, [1, 14, 1, 1], 1], ["SP", 6, 15, 64, [8, 2, 2, 1], 1], 
["SP", 6, 20, 64, [4, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 32, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[6.50144e-05], 0, 3.25197, 1650980839], "v": "v0.6"} +{"i": [["[\"7f3fee61bc3c2604395f5d343b840b7c\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 2, 1, 2], 1], ["SP", 6, 10, 49, [1, 7, 1, 1], 1], 
["SP", 6, 15, 256, [2, 32, 4, 1], 1], ["SP", 6, 20, 256, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000233087], 0, 0.828703, 1650980851], "v": "v0.6"} +{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 14, 14, 256], [1, 1, 256, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 1], 1], ["SP", 3, 10, 7, [7, 1, 1, 1], 1], ["SP", 3, 15, 512, [2, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [8, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 
2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.42677e-05], 0, 1.28859, 1650980857], "v": "v0.6"} +{"i": [["[\"affd3c4a65f665e451a06d65bf32750d\", [1, 112, 112, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [1], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000265616], 0, 0.615762, 1650980871], "v": "v0.6"} +{"i": [["[\"00a059b856ac30ac172b6252254479a6\", [1, 512], [1000, 512], [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [4, 50, 1, 1], 1], ["SP", 2, 10, 512, [2, 4], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 
6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 8, [2], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[7.0238e-05], 0, 0.673282, 1650980874], "v": "v0.6"} +{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 56, 56, 64], [3, 3, 64, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 2, 7, 2], 1], ["SP", 3, 10, 28, [2, 7, 1, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 64, [1, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [4], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 145, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00116892], 0, 1.6731, 1650980882], "v": "v0.6"} diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 6f331499b042..e59e78f57154 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -780,7 +780,7 @@ def 
register(myf): return register -def prepare_input_map(args): +def prepare_input_map(args, workload_key=None): """This function deals with special task inputs. Map the input Tensor of a TVM subgraph to a specific buffer name in the global buffer map. @@ -789,6 +789,11 @@ def prepare_input_map(args): args : List[Tensor] Input/output Tensor of a TVM subgraph. + workload_key: Optional[str] + The workload for which these inputs are being prepared. This + is used to identify if an input is being provided by (see + `register_task_input_buffer`). + Returns ------- Dict[Tensor, str] : @@ -803,13 +808,19 @@ def prepare_input_map(args): global TASK_INPUT_CHECK_FUNC_REGISTRY + from .search_task import TASK_INPUT_BUFFER_TABLE + # A dict that maps the input tensor arg to a buffer name tensor_input_map = {} # Case 0: Check placeholder name for arg in args: if isinstance(arg.op, tvm.te.PlaceholderOp): - if arg.op.name != "placeholder": + if ( + workload_key + and workload_key in TASK_INPUT_BUFFER_TABLE + and arg.op.name in TASK_INPUT_BUFFER_TABLE[workload_key] + ): tensor_input_map[arg] = arg.op.name # Case 1: Check specific tensor inputs @@ -843,7 +854,7 @@ def prepare_runner_args(inp, build_res): from .search_task import get_task_input_buffer # lazily import to avoid recursive dependency task_input_names = inp.task.task_input_names - tensor_input_map = prepare_input_map(build_res.args) + tensor_input_map = prepare_input_map(build_res.args, inp.task.workload_key) if not task_input_names: tensor_input_map = {} args = [] diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 9541232a6a38..52c7f44fcede 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -336,7 +336,8 @@ def auto_schedule_topi(func_name, outs): logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err)) return None - key = register_workload_tensors(dag.workload_key(), 
io_tensors) + workload_key = dag.workload_key() + key = register_workload_tensors(workload_key, io_tensors) target = tvm.target.Target.current() dispatch_ctx = DispatchContext.current @@ -356,7 +357,7 @@ def auto_schedule_topi(func_name, outs): # in the task extraction mode if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK: env.add_workload_key(func_name, key) - input_map = prepare_input_map(io_tensors) + input_map = prepare_input_map(io_tensors, workload_key) if input_map: env.add_workload_input_names(key, list(input_map.values())) elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index a3d46170dfca..f5c8994bec77 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -268,7 +268,7 @@ def extract_param_base_addresses(mod, buffer_info, scratch_region_map) -> List[u size_bytes = element_size_bytes * np.prod(list(buffer.shape)) base_addresses.append( util.BaseAddress( - param.name, + param.name.replace("-", "_"), idx, _get_region(buffer_info[param].btype, param, scratch_region_map), size_bytes, diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 1d7566ebe2bd..a8eb6a58105f 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -131,7 +131,8 @@ class LowerToTECompute : public backend::MemoizedExprTranslatorparams) { Array inputs; for (const auto& ttype : FlattenTupleType(param->checked_type())) { - tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); + tvm::te::Tensor tensor = + tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, param->vid->name_hint); inputs.push_back(tensor); fn_inputs_.push_back(tensor); } @@ -478,7 +479,8 @@ class MakeShapeFunc : public 
backend::MemoizedExprTranslator> for (const auto& ttype : FlattenTupleType(param->checked_type())) { // Add data placeholder (in case we discover we need it below) Shape shape = GetShape(ttype->shape); - tvm::te::Tensor data_tensor = tvm::te::placeholder(shape, ttype->dtype); + tvm::te::Tensor data_tensor = + tvm::te::placeholder(shape, ttype->dtype, "data_" + param->vid->name_hint); data_inputs.push_back(data_tensor); // Add shape placeholder (in case we discover we need it below) int64_t ndim = shape.size(); @@ -486,7 +488,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> if (ndim > 0) { sshape.push_back(tvm::Integer(ndim)); } - tvm::te::Tensor shape_tensor = tvm::te::placeholder(sshape, DataType::Int(64)); + tvm::te::Tensor shape_tensor = + tvm::te::placeholder(sshape, DataType::Int(64), "shape_" + param->vid->name_hint); shape_inputs.push_back(shape_tensor); } param_data_[param] = data_inputs; From a0cbefbe9568468a35bc3dce7d23a143da3008b8 Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Wed, 14 Sep 2022 17:16:57 +0100 Subject: [PATCH 164/704] [CI] Set USE_CMSISNN and USE_ETHOSU off in task_config_build_cpu.sh (#12456) The dependencies for these have moved into ci_cortexm Docker image, so there is not much point in building them for ci_cpu as we can't run the associated tests. 
--- tests/scripts/task_config_build_cpu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 7f48839f23c0..8d5a2a95bb89 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -51,11 +51,11 @@ echo set\(USE_TENSORFLOW_PATH \"/tensorflow\"\) >> config.cmake echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake -echo set\(USE_CMSISNN ON\) >> config.cmake +echo set\(USE_CMSISNN OFF\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake echo set\(USE_VERILATOR ON\) >> config.cmake echo set\(USE_LIBBACKTRACE ON\) >> config.cmake echo set\(USE_CCACHE OFF\) >> config.cmake -echo set\(USE_ETHOSU ON\) >> config.cmake +echo set\(USE_ETHOSU OFF\) >> config.cmake echo set\(USE_UMA ON\) >> config.cmake echo set\(SUMMARIZE ON\) >> config.cmake From 3d7439eb0bf3d0a2253e7011b7f115499b7f4f33 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Wed, 14 Sep 2022 12:36:10 -0700 Subject: [PATCH 165/704] [TVMScript] IRBuilder methods for `PrimFunc` (#12755) This PR introduces remaining IRBuilder methods for `PrimFunc`. Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/ir.h | 126 ++++ python/tvm/script/ir_builder/tir/ir.py | 629 +++++++++++++++++- src/script/ir_builder/tir/ir.cc | 194 ++++++ src/script/ir_builder/tir/utils.h | 32 + .../unittest/test_tvmscript_ir_builder_tir.py | 44 +- 5 files changed, 1022 insertions(+), 3 deletions(-) diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index 615ce90383dd..aaa5442eede3 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -28,12 +28,111 @@ namespace script { namespace ir_builder { namespace tir { +using tvm::tir::Buffer; +using tvm::tir::Var; + +/*! 
+ * \brief The buffer declaration function. + * \param shape The type of the buffer prior to flattening. + * \param dtype The data type in the content of the buffer. + * \param buffer_name The name of the buffer. + * \param data The pointer to the head of the data. + * \param strides The strides of each dimension. + * \param elem_offset The offset in terms of number of dtype elements (including lanes). + * \param storage_scope The optional storage scope of buffer data pointer. + * \param align The alignment requirement of data pointer in bytes. + * \param offset_factor The factor of elem_offset field. + * \param buffer_type The buffer type. + * \param axis_separators The separators between input axes when generating flattened output axes. + * \return The declared buffer. + */ +Buffer BufferDecl(Array shape, DataType dtype, String buffer_name, Optional data, + Optional> strides, Optional elem_offset, + String storage_scope, int align, int offset_factor, String buffer_type, + Optional> axis_separators); + /*! * \brief The primitive function statement. * \return The PrimFuncFrame. */ PrimFuncFrame PrimFunc(); +/*! + * \brief The PrimFunc variable arguments adding function. + * \param name The name of the variable. + * \param var The variable argument. + * \return The variable. + */ +Var Arg(String name, Var var); + +/*! + * \brief The PrimFunc buffer arguments adding function. + * \param name The name of the buffer. + * \param buffer The buffer argument. + * \return The buffer. + */ +Buffer Arg(String name, Buffer buffer); + +/*! + * \brief The PrimFunc naming statement. + * \param name The name of the PrimFunc. + */ +void FuncName(String name); + +/*! + * \brief The PrimFunc annotation statement. + * \param attrs The annotations of the PrimFunc. + */ +void FuncAttrs(Map attrs); + +/*! + * \brief The PrimFunc return type statement. + * \param ret_type The return type of the PrimFunc. + * \return The return type. + */ +Type FuncRet(Type ret_type); + +/*! 
+ * \brief The buffer match statement. + * \param param The parameter of the PrimFunc to match. + * \param shape The type of the buffer prior to flattening. + * \param dtype The data type in the content of the buffer. + * \param data The pointer to the head of the data. + * \param strides The strides of each dimension. + * \param elem_offset The offset in terms of number of dtype elements (including lanes). + * \param storage_scope The optional storage scope of buffer data pointer. + * \param align The alignment requirement of data pointer in bytes. + * \param offset_factor The factor of elem_offset field. + * \param buffer_type The buffer type. + * \param axis_separators The separators between input axes when generating flattened output axes. + * \return The matched buffer. + */ +Buffer MatchBuffer(ObjectRef param, Array shape, DataType dtype = DataType::Float(32), + Optional data = NullOpt, Array strides = {}, + PrimExpr elem_offset = PrimExpr(), String storage_scope = "global", + int align = -1, int offset_factor = 0, String buffer_type = "default", + Array axis_separators = {}); + +/*! + * \brief The pre-flattened buffer statement. + * \param postflattened_buffer The original buffer to be flattened. + * \param shape The type of the buffer prior to flattening. + * \param dtype The data type in the content of the buffer. + * \param data The pointer to the head of the data. + * \param strides The strides of each dimension. + * \param elem_offset The offset in terms of number of dtype elements (including lanes). + * \param storage_scope The optional storage scope of buffer data pointer. + * \param align The alignment requirement of data pointer in bytes. + * \param offset_factor The factor of elem_offset field. + * \param buffer_type The buffer type. + * \param axis_separators The separators between input axes when generating flattened output axes. 
+ */ +void PreflattenedBuffer(Buffer postflattened_buffer, Array shape, + DataType dtype = DataType::Float(32), Optional data = NullOpt, + Array strides = {}, PrimExpr elem_offset = PrimExpr(), + String storage_scope = "global", int align = -1, int offset_factor = 0, + String buffer_type = "default", Array axis_separators = {}); + /*! * \brief The block declaration statement. * \param name The name of the block. @@ -48,6 +147,33 @@ BlockFrame Block(String name, bool no_realize = false); */ void Evaluate(PrimExpr value); +#define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType) \ + inline PrimExpr FuncName(Optional expr = NullOpt) { \ + DataType dtype = DType; \ + return expr.defined() ? tvm::cast(dtype, expr.value()) : tvm::tir::Var("", dtype); \ + } + +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int8, DataType::Int(8)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int16, DataType::Int(16)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32, DataType::Int(32)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int64, DataType::Int(64)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt8, DataType::UInt(8)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt16, DataType::UInt(16)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt32, DataType::UInt(32)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(UInt64, DataType::UInt(64)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float8, DataType::Float(8)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float16, DataType::Float(16)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float32, DataType::Float(32)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Float64, DataType::Float(64)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x4, DataType::Int(32, 4)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x8, DataType::Int(32, 8)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Int32x16, DataType::Int(32, 16)); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Boolean, DataType::Bool()); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Handle, DataType::Handle()); +TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(Void, DataType::Void()); + +#undef TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST + } // namespace tir } // namespace 
ir_builder } // namespace script diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 7ba2f6df9418..63fd1291f4bc 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -17,11 +17,89 @@ # pylint: disable=missing-docstring """IRBuilder for TIR""" -from tvm.tir import PrimExpr, StringImm +from numbers import Integral +from typing import Any, Dict, List, Optional, Union, Tuple + +from tvm.ir import Type +from tvm.tir import ( + Buffer, + BufferLoad, + BufferRegion, + PrimExpr, + StringImm, + Var, +) from . import _ffi_api, frame +def buffer_decl( + shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral], + dtype: str = "float32", + data: Var = None, + strides: List[PrimExpr] = None, + elem_offset: PrimExpr = None, + scope: str = "", + align: int = 0, + offset_factor: int = 0, + buffer_type: str = "", + axis_separators: List[int] = None, +) -> Buffer: + """The buffer declaration function. + + Parameters + ---------- + shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] + The type of the buffer prior to flattening. + + dtype : str + The data type in the content of the buffer. + + data : Var + The pointer to the head of the data. + + strides : List[PrimExpr] + The strides of each dimension. + + elem_offset : PrimExpr + The offset in terms of number of dtype elements (including lanes). + + scope : str + The optional storage scope of buffer data pointer. + + align : int + The alignment requirement of data pointer in bytes. + + offset_factor : int + The factor of elem_offset field. + + buffer_type : str + The buffer type. + + axis_separators : List[int] + The separators between input axes when generating flattened output axes. + + Returns + ------- + res : Buffer + The declared buffer. 
+ """ + shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape + return _ffi_api.BufferDecl( # pylint: disable=no-member # type: ignore + shape, + dtype, + "", + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + ) + + def prim_func() -> frame.PrimFuncFrame: """The primitive function statement. @@ -33,6 +111,220 @@ def prim_func() -> frame.PrimFuncFrame: return _ffi_api.PrimFunc() # pylint: disable=no-member # type: ignore +def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]: + """The PrimFunc arguments adding function. + + Parameters + ---------- + name : str + The name of the argument. + + var : Union[Var, Buffer] + The argument of Var or Buffer. + + Returns + ------- + res : Union[Var, Buffer] + The argument. + """ + return _ffi_api.Arg(name, obj) # pylint: disable=no-member # type: ignore + + +def func_name(name: str) -> None: + """The PrimFunc naming statement. + + Parameters + ---------- + name : str + The name of the PrimFunc. + """ + _ffi_api.FuncName(name) # pylint: disable=no-member # type: ignore + + +def func_attr(attrs: Dict[str, Any]) -> None: + """The PrimFunc annotation statement. + + Parameters + ---------- + attrs : Dict[str, Any] + The annotations of the PrimFunc. + """ + _ffi_api.FuncAttrs(attrs) # pylint: disable=no-member # type: ignore + + +def func_ret(ret_type: Type) -> Type: + """The PrimFunc return type statement. + + Parameters + ---------- + ret_type : Type + The return type of the PrimFunc. + + Returns + ------- + res : Type + The return type. 
+ """ + return _ffi_api.FuncRet(ret_type) # pylint: disable=no-member # type: ignore + + +def match_buffer( + param: Union[Var, BufferLoad, BufferRegion], + shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral], + dtype: str = "float32", + data: Var = None, + strides: List[PrimExpr] = None, + elem_offset: PrimExpr = None, + scope: str = "global", + align: int = -1, + offset_factor: int = 0, + buffer_type: str = "default", + axis_separators: List[int] = None, +) -> Buffer: + """The buffer match function. + + Note + ---- + This function will perform different behavior, depending on the type of param. + If the param is a var in function parameter, it will create a buffer from DLTensor. + Else if the param is a subregion of other buffers, then create a subregion match inside a block. + + Example + ------- + Match buffer from function parameter + .. code-block:: python + A = T.match_buffer(a, (128, 128), dtype="float32") + + Match buffer from Buffer subregion + .. code-block:: python + A = T.match_buffer(B[0:128, i * 128 : i * 128 + 128], (128, 128), dtype="float32") + + Parameters + ---------- + param : Union[Var, BufferLoad, BufferRegion] + The parameter of the PrimFunc to match. + + shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] + The type of the buffer prior to flattening. + + dtype : str + The data type in the content of the buffer. + + data : Var + The pointer to the head of the data. + + strides : List[PrimExpr] + The strides of each dimension. + + elem_offset : PrimExpr + The offset in terms of number of dtype elements (including lanes). + + scope : str + The optional storage scope of buffer data pointer. + + align : int + The alignment requirement of data pointer in bytes. + + offset_factor : int + The factor of elem_offset field. + + buffer_type : str + The buffer type. + + axis_separators : List[int] + The separators between input axes when generating flattened output axes. 
+ + Returns + ------- + res : Buffer + The matched buffer. + """ + shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape + if strides is None: + strides = [] + return _ffi_api.MatchBuffer( # pylint: disable=no-member # type: ignore + param, + shape, + dtype, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + ) + + +def preflattened_buffer( + postflattened: Buffer, + shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral], + dtype: str = "float32", + data: Var = None, + strides: List[PrimExpr] = None, + elem_offset: PrimExpr = None, + scope: str = "global", + align: int = -1, + offset_factor: int = 0, + buffer_type: str = "default", + axis_separators: List[int] = None, +) -> None: + """The pre-flattened buffer statement. + + Parameters + ---------- + postflattened : Buffer + The original buffer to be flattened. + + shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] + The type of the buffer prior to flattening. + + dtype : str + The data type in the content of the buffer. + + data : Var + The pointer to the head of the data. + + strides : List[PrimExpr] + The strides of each dimension. + + elem_offset : PrimExpr + The offset in terms of number of dtype elements (including lanes). + + scope : str + The optional storage scope of buffer data pointer. + + align : int + The alignment requirement of data pointer in bytes. + + offset_factor : int + The factor of elem_offset field. + + buffer_type : str + The buffer type. + + axis_separators : List[int] + The separators between input axes when generating flattened output axes. 
+ """ + shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape + if strides is None: + strides = [] + _ffi_api.PreflattenedBuffer( # pylint: disable=no-member # type: ignore + postflattened, + shape, + dtype, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + ) + + def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame: """The block declaration statement. @@ -65,11 +357,344 @@ def evaluate(value: PrimExpr) -> None: return _ffi_api.Evaluate(value) # pylint: disable=no-member # type: ignore +def int8(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int8 or cast expression to type int8. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int8 or casted expression with type int8. + """ + return _ffi_api.Int8(expr) # pylint: disable=no-member # type: ignore + + +def int16(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int16 or cast expression to type int16. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int16 or casted expression with type int16. + """ + return _ffi_api.Int16(expr) # pylint: disable=no-member # type: ignore + + +def int32(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int32 or cast expression to type int32. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int32 or casted expression with type int32. + """ + return _ffi_api.Int32(expr) # pylint: disable=no-member # type: ignore + + +def int64(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int64 or cast expression to type int64. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. 
+ + Returns + ------- + res : PrimExpr + The new tir.Var with type int64 or casted expression with type int64. + """ + return _ffi_api.Int64(expr) # pylint: disable=no-member # type: ignore + + +def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type uint8 or cast expression to type uint8. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type uint8 or casted expression with type uint8. + """ + return _ffi_api.UInt8(expr) # pylint: disable=no-member # type: ignore + + +def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type uint16 or cast expression to type uint16. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type uint16 or casted expression with type uint16. + """ + return _ffi_api.UInt16(expr) # pylint: disable=no-member # type: ignore + + +def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type uint32 or cast expression to type uint32. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type uint32 or casted expression with type uint32. + """ + return _ffi_api.UInt32(expr) # pylint: disable=no-member # type: ignore + + +def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type uint64 or cast expression to type uint64. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type uint64 or casted expression with type uint64. + """ + return _ffi_api.UInt64(expr) # pylint: disable=no-member # type: ignore + + +def float8(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type float8 or cast expression to type float8. 
+ + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type float8 or casted expression with type float8. + """ + return _ffi_api.Float8(expr) # pylint: disable=no-member # type: ignore + + +def float16(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type float16 or cast expression to type float16. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type float16 or casted expression with type float16. + """ + return _ffi_api.Float16(expr) # pylint: disable=no-member # type: ignore + + +def float32(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type float32 or cast expression to type float32. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type float32 or casted expression with type float32. + """ + return _ffi_api.Float32(expr) # pylint: disable=no-member # type: ignore + + +def float64(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type float64 or cast expression to type float64. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type float64 or casted expression with type float64. + """ + return _ffi_api.Float64(expr) # pylint: disable=no-member # type: ignore + + +def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int32x4 or cast expression to type int32x4. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int32x4 or casted expression with type int32x4. 
+ """ + return _ffi_api.Int32x4(expr) # pylint: disable=no-member # type: ignore + + +def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int32x8 or cast expression to type int32x8. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int32x8 or casted expression with type int32x8. + """ + return _ffi_api.Int32x8(expr) # pylint: disable=no-member # type: ignore + + +def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type int32x16 or cast expression to type int32x16. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type int32x16 or casted expression with type int32x16. + """ + return _ffi_api.Int32x16(expr) # pylint: disable=no-member # type: ignore + + +def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type boolean or cast expression to type boolean. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type boolean or casted expression with type boolean. + """ + return _ffi_api.Boolean(expr) # pylint: disable=no-member # type: ignore + + +def handle(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type handle or cast expression to type handle. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. + + Returns + ------- + res : PrimExpr + The new tir.Var with type handle or casted expression with type handle. + """ + return _ffi_api.Handle(expr) # pylint: disable=no-member # type: ignore + + +def void(expr: Optional[PrimExpr] = None) -> PrimExpr: + """Construct a new tir.Var with type void or cast expression to type void. + + Parameters + ---------- + expr: PrimExpr + The expression to be cast. 
+ + Returns + ------- + res : PrimExpr + The new tir.Var with type void or casted expression with type void. + """ + return _ffi_api.Void(expr) # pylint: disable=no-member # type: ignore + + +def var(dtype, name="") -> Var: + """Construct a new tir.Var. + + Parameters + ---------- + dtype: str + The dtype of the Var. + + name: str + The name of the Var. + + Returns + ------- + res : Var + The result tir.Var. + """ + return Var(name, dtype) # pylint: disable=no-member # type: ignore + + # pylint: enable=invalid-name __all__ = [ + "buffer_decl", + "prim_func", + "arg", + "func_name", + "func_attr", + "func_ret", + "match_buffer", + "preflattened_buffer", "block", "evaluate", - "prim_func", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float8", + "float16", + "float32", + "float64", + "int32x4", + "int32x8", + "int32x16", + "boolean", + "handle", + "void", + "var", ] diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 4c2679ae6b56..e2c1218a7e87 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -28,6 +28,30 @@ namespace tir { using tvm::tir::IterVar; +Buffer BufferDecl(Array shape, DataType dtype, String buffer_name, Optional data, + Optional> strides, Optional elem_offset, + String storage_scope, int align, int offset_factor, String buffer_type, + Optional> axis_separators) { + Var buffer_data; + if (!data.defined()) { + DataType storage_dtype = dtype; + if (storage_dtype == DataType::Bool()) { + storage_dtype = DataType::Int(8); + } + buffer_data = tvm::tir::Var(buffer_name, PointerType(PrimType(storage_dtype), storage_scope)); + } else { + buffer_data = data.value(); + } + if (!elem_offset.defined() && offset_factor) { + DataType shape_dtype = shape.empty() ? 
DataType::Int(32) : shape[0]->dtype; + elem_offset = tvm::tir::Var("elem_offset", shape_dtype); + } + return Buffer(buffer_data, dtype, shape, strides.value_or(Array()), + elem_offset.value_or(PrimExpr()), buffer_name, align, offset_factor, + (buffer_type == "auto_broadcast") ? tvm::tir::kAutoBroadcast : tvm::tir::kDefault, + axis_separators.value_or(Array())); +} + PrimFuncFrame PrimFunc() { ObjectPtr n = make_object(); n->name = NullOpt; @@ -41,6 +65,98 @@ PrimFuncFrame PrimFunc() { return PrimFuncFrame(n); } +Var Arg(String name, Var var) { + PrimFuncFrame frame = FindPrimFuncFrame("T.Arg"); + details::Namer::Name(var, name); + frame->args.push_back(var); + return var; +} + +Buffer Arg(String name, Buffer buffer) { + PrimFuncFrame frame = FindPrimFuncFrame("T.Arg"); + details::Namer::Name(buffer, name); + Var handle(buffer->name + "_handle", DataType::Handle()); + frame->args.push_back(handle); + frame->buffer_map.Set(handle, buffer); + return buffer; +} + +void FuncName(String name) { + PrimFuncFrame frame = FindPrimFuncFrame("T.func_name"); + if (frame->name.defined()) { + LOG(FATAL) << "ValueError: Duplicate prim func name, previous one is " << frame->name.value(); + } + frame->name = name; +} + +void FuncAttrs(Map attrs) { + using namespace tvm::tir; + PrimFuncFrame frame = FindPrimFuncFrame("T.func_attr"); + if (frame->attrs.defined()) { + LOG(FATAL) << "ValueError: Duplicate prim func annotations, previous one is " << frame->attrs; + } + frame->attrs = attrs; +} + +tvm::Type FuncRet(tvm::Type ret_type) { + PrimFuncFrame frame = FindPrimFuncFrame("T.ret_type"); + if (frame->ret_type.defined()) { + LOG(FATAL) << "ValueError: Duplicate prim func return type, previous one is " + << frame->ret_type.value(); + } + frame->ret_type = ret_type; + return ret_type; +} + +Buffer MatchBuffer(ObjectRef param, Array shape, DataType dtype, Optional data, + Array strides, PrimExpr elem_offset, String storage_scope, int align, + int offset_factor, String buffer_type_str, 
Array axis_separators) { + Buffer buffer = BufferDecl(shape, dtype, "", data, strides, elem_offset, storage_scope, align, + offset_factor, buffer_type_str, axis_separators); + if (const auto* var = param.as()) { + PrimFuncFrame frame = FindPrimFuncFrame("T.match_buffer"); + Var v = GetRef(var); + for (auto const& arg : frame->args) { + if (arg.same_as(v)) { + frame->buffer_map.Set(v, buffer); + return buffer; + } + } + LOG(FATAL) << "ValueError: Can not bind non-input param to buffer."; + } else if (const auto* buffer_load = param.as()) { + BlockFrame frame = FindBlockFrame("T.match_buffer"); + frame->match_buffers.push_back(tvm::tir::MatchBufferRegion( + buffer, BufferRegionFromLoad(GetRef(buffer_load)))); + } else if (const auto* buffer_region = param.as()) { + BlockFrame frame = FindBlockFrame("T.match_buffer"); + frame->match_buffers.push_back( + tvm::tir::MatchBufferRegion(buffer, GetRef(buffer_region))); + } else { + LOG(FATAL) << "ValueError: Unexpected type for TIR MatchBuffer."; + } + return buffer; +} + +void PreflattenedBuffer(Buffer postflattened_buffer, Array shape, DataType dtype, + Optional data, Array strides, PrimExpr elem_offset, + String storage_scope, int align, int offset_factor, String buffer_type_str, + Array axis_separators) { + PrimFuncFrame frame = FindPrimFuncFrame("T.preflattened_buffer"); + for (auto const& p : frame->buffer_map) { + if (p.second.same_as(postflattened_buffer)) { + String buffer_name(postflattened_buffer->name + "_preflatten"); + Buffer buffer = + BufferDecl(shape, dtype, buffer_name, data.value_or(p.second->data), strides, elem_offset, + storage_scope, align, offset_factor, buffer_type_str, axis_separators); + details::Namer::Name(buffer, buffer_name); + frame->preflattened_buffer_map.Set(p.first, buffer); + return; + } + } + LOG(FATAL) << "ValueError: postflattened buffer " << postflattened_buffer->name + << " does not exist."; +} + BlockFrame Block(String name, bool no_realize) { ObjectPtr n = make_object(); n->name = 
name; @@ -58,9 +174,87 @@ BlockFrame Block(String name, bool no_realize) { } void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } + +using tvm::script::ir_builder::details::Namer; + +TVM_STATIC_IR_FUNCTOR(Namer, vtable) + .set_dispatch([](const ObjectRef& node, String name) -> void { + tvm::tir::BufferNode* buffer = + const_cast(node.as()); + buffer->name = name; + Namer::Name(buffer->data, name); + int n = buffer->strides.size(); + for (int i = 0; i < n; ++i) { + PrimExpr e = buffer->strides[i]; + if (const tvm::tir::VarNode* v = e.as()) { + Namer::Name(GetRef(v), name + "_s" + std::to_string(i)); + } + } + }); + +TVM_STATIC_IR_FUNCTOR(Namer, vtable) + .set_dispatch([](const ObjectRef& node, String name) -> void { + using namespace tvm::tir; + SizeVarNode* var = const_cast(node.as()); + var->name_hint = name; + }); + +TVM_STATIC_IR_FUNCTOR(Namer, vtable) + .set_dispatch([](const ObjectRef& node, String name) -> void { + using namespace tvm::tir; + VarNode* var = const_cast(node.as()); + var->name_hint = name; + }); + +TVM_STATIC_IR_FUNCTOR(Namer, vtable) + .set_dispatch([](const ObjectRef& node, String name) -> void { + using namespace tvm::tir; + IterVarNode* var = const_cast(node.as()); + Namer::Name(var->var, name); + }); + +TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferDecl").set_body_typed(BufferDecl); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.PrimFunc").set_body_typed(PrimFunc); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Arg") + .set_body_typed([](String name, ObjectRef obj) -> ObjectRef { + using namespace tvm::tir; + if (const auto* var = obj.as()) { + return Arg(name, GetRef(var)); + } + if (const auto* buffer = obj.as()) { + return Arg(name, GetRef(buffer)); + } + LOG(FATAL) << "ValueError: Unexpected type for TIR Arg: " << obj->GetTypeKey(); + throw; + }); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncName").set_body_typed(FuncName); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncAttrs").set_body_typed(FuncAttrs); 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.FuncRet").set_body_typed(FuncRet); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuffer); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); + +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int16").set_body_typed(Int16); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32").set_body_typed(Int32); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int64").set_body_typed(Int64); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt8").set_body_typed(UInt8); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt16").set_body_typed(UInt16); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt32").set_body_typed(UInt32); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.UInt64").set_body_typed(UInt64); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float8").set_body_typed(Float8); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float16").set_body_typed(Float16); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float32").set_body_typed(Float32); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Float64").set_body_typed(Float64); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x4").set_body_typed(Int32x4); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x8").set_body_typed(Int32x8); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x16").set_body_typed(Int32x16); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Boolean").set_body_typed(Boolean); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Handle").set_body_typed(Handle); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Void").set_body_typed(Void); } // namespace tir } // namespace ir_builder } // namespace script diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h index 4f8b3f77c6e1..c29fae1c65e9 
100644 --- a/src/script/ir_builder/tir/utils.h +++ b/src/script/ir_builder/tir/utils.h @@ -28,6 +28,10 @@ namespace script { namespace ir_builder { namespace tir { +/*! + * \brief Add tir Stmt to the top frame in IRBuilder frame stack. + * \param stmt The Stmt. + */ inline void AddToParent(tvm::tir::Stmt stmt) { IRBuilder builder = IRBuilder::Current(); if (builder->frames.empty()) { @@ -40,6 +44,11 @@ inline void AddToParent(tvm::tir::Stmt stmt) { } } +/*! + * \brief Convert array of tir Stmt to single Stmt. + * \param stmt The array of Stmt. + * \return The SeqStmt. + */ inline tvm::tir::Stmt AsStmt(const Array& stmt) { using namespace tvm::tir; if (stmt.empty()) { @@ -51,6 +60,11 @@ inline tvm::tir::Stmt AsStmt(const Array& stmt) { } } +/*! + * \brief Check whether the top frame in IRBuilder frame stack is PrimFuncFrame. + * \param method The method name to be printed when throwing exception. + * \return The top frame of PrimFuncFrame. + */ inline PrimFuncFrame FindPrimFuncFrame(const String& method) { if (Optional frame = IRBuilder::Current()->GetLastFrame()) { return frame.value(); @@ -60,6 +74,11 @@ inline PrimFuncFrame FindPrimFuncFrame(const String& method) { throw; } +/*! + * \brief Check whether the top frame in IRBuilder frame stack is BlockFrame. + * \param method The method name to be printed when throwing exception. + * \return The top frame of BlockFrame. + */ inline BlockFrame FindBlockFrame(const String& method) { if (Optional frame = IRBuilder::Current()->GetLastFrame()) { return frame.value(); @@ -69,6 +88,19 @@ inline BlockFrame FindBlockFrame(const String& method) { throw; } +/*! + * \brief Convert BufferLoad to BufferRegion. + * \param buffer_load The BufferLoad. + * \return The converted BufferRegion. 
+ */ +inline tvm::tir::BufferRegion BufferRegionFromLoad(tvm::tir::BufferLoad buffer_load) { + Array ranges; + for (const PrimExpr& index : buffer_load->indices) { + ranges.push_back(Range::FromMinExtent(index, IntImm(index->dtype, 1))); + } + return tvm::tir::BufferRegion(buffer_load->buffer, ranges); +} + } // namespace tir } // namespace ir_builder } // namespace script diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 85080c7c65fc..5c93e99909d9 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -25,7 +25,7 @@ from tvm.ir.base import assert_structural_equal -def test_ir_builder_tir_primfunc(): +def test_ir_builder_tir_primfunc_base(): with IRBuilder() as ib: with T.prim_func(): T.evaluate(0) @@ -45,6 +45,48 @@ def test_ir_builder_tir_primfunc(): assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) +def test_ir_builder_tir_primfunc_complete(): + with IRBuilder() as ib: + with T.prim_func(): + T.arg("a", T.handle()) + T.arg("b", T.var("int64")) + T.arg("c", T.buffer_decl((128, 128), "float32")) + d = T.arg("d", T.handle()) + e = T.arg("e", T.buffer_decl((1024,), "int8")) + T.func_attr({"key": "value"}) + T.func_ret(tvm.ir.PrimType("int64")) + buffer_d = T.match_buffer(d, (64, 64), "int64") + T.preflattened_buffer(e, (32, 32), "int8", data=e.data) + T.evaluate(0) + # the prim_func generated by IRBuilder + prim_func_actual = ib.get() + + # the expected prim_func + c_handle, c_buffer = tir.Var("c_handle", "handle"), tir.decl_buffer( + (128, 128), "float32", name="c" + ) + d_handle, d_buffer = tir.Var("d", "handle"), tir.decl_buffer((64, 64), "int64", name="d") + e_handle, e_buffer = tir.Var("e_handle", "handle"), tir.decl_buffer((1024,), "int8", name="e") + prim_func_expected = tir.PrimFunc( + params=[ + tir.Var("a", "handle"), + tir.Var("b", "int64"), + c_handle, + d_handle, + 
e_handle, + ], + body=tir.Evaluate(0), + ret_type=tvm.ir.PrimType("int64"), + buffer_map={c_handle: c_buffer, d_handle: d_buffer, e_handle: e_buffer}, + preflattened_buffer_map={ + e_handle: tir.decl_buffer((32, 32), "int8", name="e_preflatten", data=e_buffer.data) + }, + attrs=tvm.ir.make_node("DictAttrs", key="value"), + ) + # Check if the generated ir is expected + assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) + + def test_ir_builder_tir_block(): with IRBuilder() as ib: with T.block("block"): From 421ff76e3e02e0d97018623fc1a42f202fe202bc Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Wed, 14 Sep 2022 17:24:14 -0400 Subject: [PATCH 166/704] [TIR][Meta-Schedule] Tuple-reduction scheduling support (#11639) [TIR][MetaSchedule] Support Tuple Reduction This PR improves our TIR scheduling primitives/transformations (rfactor & cross-thread reduction) designed for reduction operators, so that they can be applied to blocks of tuple-reduction. --- .../schedule_rule/cross_thread_reduction.cc | 7 + src/tir/schedule/analysis.h | 48 +- src/tir/schedule/analysis/analysis.cc | 524 +------------ src/tir/schedule/analysis/reducer.cc | 702 ++++++++++++++++++ src/tir/schedule/primitive/reduction.cc | 402 ++++++---- .../lower_cross_thread_reduction.cc | 323 ++++---- ...meta_schedule_schedule_rule_add_rfactor.py | 166 +++++ ...le_schedule_rule_cross_thread_reduction.py | 99 +++ .../unittest/test_tir_schedule_rfactor.py | 649 +++++++++++++++- ..._transform_lower_cross_thread_reduction.py | 244 +++++- 10 files changed, 2314 insertions(+), 850 deletions(-) create mode 100644 src/tir/schedule/analysis/reducer.cc diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc index 0f0ab99e7259..35be33f72e21 100644 --- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc +++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc @@ -184,6 +184,13 @@ class 
CrossThreadReductionNode : public ScheduleRuleNode { */ std::tuple GetComputeTargetLoopAndBlock( const tir::Schedule& sch, const tir::BlockRV& block_rv) { + // Step 0. Due to technical reason of some primitives (e.g., compute-at), if the block is doing + // a tuple reduction, fusion is temporarily not supported. + if (sch->Get(block_rv)->writes.size() != 1) { + return std::make_tuple(false, tir::LoopRV{nullptr}, tir::BlockRV{nullptr}, + tir::LoopRV{nullptr}); + } + // Step 1. Get all the consumers of the input block. Array consumers = sch->GetConsumers(block_rv); diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index 52ef17df162c..489df8959d1b 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -455,15 +455,14 @@ std::pair, bool> GetBufferDefiningSite(const StmtSRef& block_ /******** Reduction Block Related ********/ /*! - * \brief Convert the `init` and `body` of the input block to BufferStores - * \param self The schedule state - * \param block The block to be analyzed - * \return The BufferStores of the `init` and `body` of the input block - * \throw ScheduleError If the `init` or `body` is not BufferStore, or they don't write to the same - * buffer + * \brief Get the init values and the BufferStore updates from the input reduction block + * \param self The schedule state, used for error reporting + * \param block The block from which the init values and BufferStore updates are extracted + * \return The extracted init values and BufferStore updates + * \throw ScheduleError If rfactor or cross-thread reduction cannot be applied to the block */ -std::pair GetBufferStoresFromReductionBlock( - const Optional& self, const Block& block); +std::pair, Array> GetInitValuesAndUpdatesFromReductionBlock( + const Optional& self, Block block); /*!
* \brief Check whether the input array of IterVars only contains data-parallel and reduction block @@ -484,16 +483,17 @@ bool ContainsOnlyDataParAndReductionBlockIter(const Array& iters); bool ReductionIterNotIndexOutputBuffer(const Block& block); /*! - * \brief Given a reduction identity and a reduction combiner, detect the corresponding commutative - * reducer, and extract the combiner lhs and combiner rhs + * \brief Given a list of reduction identities and a list of reduction combiners, detect the + * corresponding commutative reducer, and extract the combiner LHS values and combiner RHS values * \param self The schedule state - * \param identity The reduction identity to be analyzed - * \param combiner The reduction combiner to be analyzed - * \return The corresponding CommReducer, the combiner lhs and the combiner rhs + * \param identities The reduction identities to be analyzed + * \param combiners The reduction combiners to be analyzed + * \return The corresponding CommReducer, combiner LHS values and combiner RHS values * \throw ScheduleError If no corresponding commutative reducer can be matched */ -std::tuple GetReducerAndCombinerLhsRhs( - const Optional& self, const PrimExpr& identity, const BufferStore& combiner); +std::tuple, Array> GetReducerAndCombinerLhsRhs( + const Optional& self, const Array& identities, + const Array& combiners); /******** Commutative Reducer ********/ @@ -502,20 +502,20 @@ std::tuple GetReducerAndCombinerLhsRhs( * \return The list of the registered reducer-getter functions * \sa ReducerRegistry */ -std::vector> GetReducerGetters(); +std::vector(Array)>> GetReducerGetters(); /*! - * \brief Given the input identity and the combiner BufferStore of a reduction, extract the - * corresponding commutative reducer and its lhs, rhs if possible. 
- * \param identity The identity of the reduction - * \param combiner The combiner of the reduction + * \brief Given the input identities and the combiner BufferStores of a reduction, extract the + * corresponding commutative reducer, LHS values and RHS values, if possible. + * \param identities The identities of the reduction + * \param combiners The combiners of the reduction * \param result_reducer The extracted CommReducer - * \param lhs The extracted lhs of the reducer - * \param rhs The extracted rhs of the reducer + * \param lhs The extracted LHS values of the reducer + * \param rhs The extracted RHS values of the reducer * \return A boolean indicating whether a corresponding commutative reducer is found */ -bool FromIdentityCombiner(const PrimExpr& identity, const BufferStore& combiner, - CommReducer* result_reducer, PrimExpr* lhs, PrimExpr* rhs); +bool FromIdentityCombiner(const Array& identities, const Array& combiners, + CommReducer* result_reducer, Array* lhs, Array* rhs); /******** Misc ********/ diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index fb09a3480a3a..7ed60876ab22 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -16,9 +16,6 @@ * specific language governing permissions and limitations * under the License. */ -#include -#include - #include "../ir_comparator.h" #include "../utils.h" @@ -1237,523 +1234,6 @@ std::pair, bool> GetBufferDefiningSite(const StmtSRef& block_ return {NullOpt, false}; } -/******** Pattern Matcher ********/ - -/*! - * \brief PrimExpr pattern matcher. - * - * It is different from the pattern matcher in arith/pattern_match.h, which is dedicated - * for compile-time constant patterns. This pattern matcher can work on dynamic user-specific - * patterns. - * - * The code below shows how to use the pattern matcher. 
- * - * \code - * - * Var x("x"), y("y"); - * // use PrimExpr to declare patterns, x, y are holes that can be filled with - * PatternMatcher pattern_matcher(x + y); - * // expr = C[i, j] + A[i, k] * B[k, j], which is the expr we want to match - * pattern_matcher.Match(expr); - * - * if (pattern_matcher.Success()) { - * pattern_matcher.Eval(x) // C[i, j] - * pattern_matcher.Eval(y) // A[i, k] * B[k, j] - * } - * - * \endcode - */ -class PatternMatcher : public ExprVisitor { - public: - explicit PatternMatcher(PrimExpr pattern) : pattern_(std::move(pattern)) {} - - void VisitExpr_(const VarNode* op) final { - auto it = filled_map_.find(op); - if (it == filled_map_.end()) { - filled_map_[op] = expr_to_match_; - } else { - ExprDeepEqual equal; - if (it->second.same_as(expr_to_match_) || equal(it->second, expr_to_match_)) return; - match_success_ = false; - } - } - - void VisitExpr_(const LoadNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (!op->buffer_var.same_as(ptr->buffer_var)) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->predicate; - VisitExpr(op->predicate); - expr_to_match_ = ptr->index; - VisitExpr(op->index); - std::swap(expr_to_match_, tmp); - } - } - } - - void VisitExpr_(const LetNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->var; - VisitExpr(op->var); - expr_to_match_ = ptr->value; - VisitExpr(op->value); - expr_to_match_ = ptr->body; - VisitExpr(op->body); - std::swap(expr_to_match_, tmp); - } - } - - void VisitExpr_(const CallNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (!op->op.same_as(ptr->op)) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - for (size_t i = 0; i < op->args.size(); ++i) { - 
expr_to_match_ = ptr->args[i]; - VisitExpr(op->args[i]); - } - std::swap(expr_to_match_, tmp); - } - } - } - -#define TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OpName) \ - void VisitExpr_(const OpName* op) { \ - const auto* ptr = expr_to_match_.as(); \ - if (ptr == nullptr) { \ - match_success_ = false; \ - } else { \ - PrimExpr current = expr_to_match_; \ - expr_to_match_ = ptr->a; \ - VisitExpr(op->a); \ - expr_to_match_ = ptr->b; \ - VisitExpr(op->b); \ - std::swap(expr_to_match_, current); \ - } \ - } - - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AddNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(SubNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MulNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(DivNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(ModNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorDivNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorModNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MinNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MaxNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(EQNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(NENode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LTNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LENode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GTNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GENode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AndNode); - TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OrNode); - - void VisitExpr_(const CastNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (!runtime::TypeEqual(op->dtype, ptr->dtype)) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->value; - VisitExpr(op->value); - std::swap(expr_to_match_, tmp); - } - } - } - - void VisitExpr_(const NotNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->a; - VisitExpr(op->a); - std::swap(expr_to_match_, tmp); - } - } - - void VisitExpr_(const SelectNode* op) final 
{ - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->condition; - VisitExpr(op->condition); - expr_to_match_ = ptr->true_value; - VisitExpr(op->true_value); - expr_to_match_ = ptr->false_value; - VisitExpr(op->false_value); - std::swap(expr_to_match_, tmp); - } - } - - void VisitExpr_(const RampNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (op->lanes != ptr->lanes) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->base; - VisitExpr(op->base); - expr_to_match_ = ptr->stride; - VisitExpr(op->stride); - std::swap(expr_to_match_, tmp); - } - } - } - - void VisitExpr_(const BroadcastNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (op->lanes != ptr->lanes) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - expr_to_match_ = ptr->value; - VisitExpr(op->value); - std::swap(expr_to_match_, tmp); - } - } - } - - void VisitExpr_(const ShuffleNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (op->vectors.size() != ptr->vectors.size() || op->indices.size() != ptr->indices.size()) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - for (size_t i = 0; i < op->indices.size(); ++i) { - expr_to_match_ = ptr->indices[i]; - VisitExpr(op->indices[i]); - } - for (size_t i = 0; i < op->vectors.size(); ++i) { - expr_to_match_ = ptr->vectors[i]; - VisitExpr(op->vectors[i]); - } - std::swap(expr_to_match_, tmp); - } - } - } - - void VisitExpr_(const IntImmNode* op) final { - const auto* ptr = expr_to_match_.as(); - match_success_ = ptr != nullptr && op->value == ptr->value; - } - - void VisitExpr_(const FloatImmNode* op) final { - const auto* ptr = expr_to_match_.as(); 
- match_success_ = ptr != nullptr && op->value == ptr->value; - } - - void VisitExpr_(const StringImmNode* op) final { - const auto* ptr = expr_to_match_.as(); - match_success_ = ptr != nullptr && op->value == ptr->value; - } - - void VisitExpr_(const BufferLoadNode* op) final { - const auto* ptr = expr_to_match_.as(); - if (ptr == nullptr) { - match_success_ = false; - } else { - if (!op->buffer.same_as(ptr->buffer) || op->indices.size() != ptr->indices.size()) { - match_success_ = false; - } else { - PrimExpr tmp = expr_to_match_; - for (size_t i = 0; i < op->indices.size(); ++i) { - expr_to_match_ = ptr->indices[i]; - VisitExpr(op->indices[i]); - } - std::swap(expr_to_match_, tmp); - } - } - } - - void Match(const PrimExpr& expr_to_match) { - this->match_success_ = true; - this->filled_map_.clear(); - this->expr_to_match_ = expr_to_match; - this->operator()(pattern_); - } - - PrimExpr Eval(const Var& var) { - auto it = filled_map_.find(var.operator->()); - ICHECK(it != filled_map_.end()) << "Unknown pattern variable"; - ICHECK(match_success_) << "Match failed"; - return it->second; - } - - bool Success() const { return match_success_; } - - private: - bool match_success_{true}; - PrimExpr pattern_, expr_to_match_; - std::unordered_map filled_map_; -}; - -/******** Reduction Block Related ********/ - -class InitBodyNotBufferStoreError : public ScheduleError { - public: - explicit InitBodyNotBufferStoreError(IRModule mod, Block block, bool init_is_bufferstore, - bool body_is_bufferstore) - : mod_(std::move(mod)), - block_(std::move(block)), - init_is_bufferstore_(init_is_bufferstore), - body_is_bufferstore_(body_is_bufferstore) {} - - String FastErrorString() const final { - return "ScheduleError: The `init` and `body` of reduction block are required to be both " - "BufferStore so that rfactor or cross-thread reduction can be applied"; - } - - String DetailRenderTemplate() const final { - if (!init_is_bufferstore_ && !body_is_bufferstore_) { - return "The `init` 
and `body` of block {0} are required to be BufferStore so that rfactor or " - "cross-thread reduction can be applied"; - } else if (!init_is_bufferstore_) { - return "The `init` of block {0} is required to be BufferStore so that rfactor or cross-thread" - " reduction can be applied"; - } else { - ICHECK(!body_is_bufferstore_); - return "The `body` of block {0} is required to be BufferStore so that rfactor or cross-thread" - " reduction can be applied"; - } - } - - IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {block_}; } - - IRModule mod_; - Block block_; - bool init_is_bufferstore_; - bool body_is_bufferstore_; -}; - -class InitBodyNotSameBufferAccessError : public ScheduleError { - public: - explicit InitBodyNotSameBufferAccessError(IRModule mod, Block block) - : mod_(std::move(mod)), block_(std::move(block)) {} - - String FastErrorString() const final { - return "ScheduleError: The `init` and `body` of the reduction block are required to have the " - "same buffer access pattern"; - } - - String DetailRenderTemplate() const final { - std::ostringstream os; - const auto* init = block_->init.as(); - const auto* update = block_->body.as(); - os << "The `init` and `body` of the block {0} is required to have the same buffer access " - "pattern. However, in block {0} the `init` writes to " - << init->buffer->name << init->indices << ", and the `body` writes to " - << update->buffer->name << update->indices; - return os.str(); - } - - IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {block_}; } - - IRModule mod_; - Block block_; -}; - -std::pair GetBufferStoresFromReductionBlock( - const Optional& self, const Block& block) { - static constexpr const char* error_str1 = - "ValueError: The `init` and `body` of the reduction block are required to be both " - "BufferStore so that rfactor or cross-thread reduction can be applied. 
However, a reduction " - "block that doesn't meet this requirement is "; - static constexpr const char* error_str2 = - "ValueError: The `init` and `body` of the reduction block are required to have the same " - "buffer access pattern so that rfactor or cross-thread reduction can be applied. However, a " - "reduction block that doesn't meet this requirement is "; - - const auto* init = block->init.as(); - const auto* body = block->body.as(); - if (!(init && body)) { - if (self.defined()) { - throw InitBodyNotBufferStoreError(self.value()->mod, block, init != nullptr, body != nullptr); - } else { - LOG(FATAL) << error_str1 << block; - } - } - if (!init->buffer.same_as(body->buffer)) { - if (self.defined()) { - throw InitBodyNotSameBufferAccessError(self.value()->mod, block); - } else { - LOG(FATAL) << error_str2 << block; - } - } - int ndim = static_cast(init->buffer->shape.size()); - for (int i = 0; i < ndim; ++i) { - if (!ExprDeepEqual()(init->indices[i], body->indices[i])) { - if (self.defined()) { - throw InitBodyNotSameBufferAccessError(self.value()->mod, block); - } else { - LOG(FATAL) << error_str2 << block; - } - } - } - return std::make_pair(GetRef(init), GetRef(body)); -} - -bool ContainsOnlyDataParAndReductionBlockIter(const Array& iters) { - for (const IterVar& iter_var : iters) { - if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) { - return false; - } - } - return true; -} - -bool ReductionIterNotIndexOutputBuffer(const Block& block) { - // Step 1. Collect the reduction block iters. - std::unordered_set reduction_block_iters; - reduction_block_iters.reserve(block->iter_vars.size()); - for (const IterVar& iter_var : block->iter_vars) { - if (iter_var->iter_type == kCommReduce) { - reduction_block_iters.insert(iter_var->var.get()); - } - } - // Step 2. Check if the reduction block iters are used to index the output buffer. 
- std::unordered_set buffer_written; - buffer_written.reserve(block->writes.size()); - for (const BufferRegion& write_region : block->writes) { - buffer_written.insert(write_region->buffer.get()); - } - auto f_uses_reduction_block_var = [&](const PrimExpr& expr) -> bool { - return UsesVar(expr, [&](const VarNode* var) { // - return reduction_block_iters.count(var); - }); - }; - bool affected = false; - PreOrderVisit(block->body, [&](const ObjectRef& obj) { - if (affected) { - return false; - } - const auto* store = obj.as(); - if (!store) { - return true; - } - ICHECK(buffer_written.count(store->buffer.get())) - << "ValueError: The buffer \"" << store->buffer - << "\" is written in the block but is not in the block's signature"; - for (const PrimExpr& index : store->indices) { - if (f_uses_reduction_block_var(index)) { - affected = true; - return false; - } - } - return false; - }); - return !affected; -} - -class NoMatchedReducerError : public ScheduleError { - public: - explicit NoMatchedReducerError(IRModule mod, PrimExpr identity, BufferStore combiner) - : mod_(std::move(mod)), identity_(std::move(identity)), combiner_(std::move(combiner)) {} - - String FastErrorString() const final { - return "ScheduleError: No matched reducer for the identity and the combiner of this reduction " - "block. So rfactor and cross-thread reduction cannot be applied."; - } - - String DetailRenderTemplate() const final { - std::ostringstream os; - os << "No matched reducer for identity " << identity_ << " and combiner " << combiner_ - << "In this case rfactor cannot be applied. 
You can check tvm::tir::ReducerRegistry for " - "default reducers or registering new reducers."; - return os.str(); - } - - IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {}; } - - IRModule mod_; - PrimExpr identity_; - BufferStore combiner_; -}; - -std::tuple GetReducerAndCombinerLhsRhs( - const Optional& self, const PrimExpr& identity, const BufferStore& combiner) { - CommReducer reducer{nullptr}; - PrimExpr combiner_lhs{nullptr}, combiner_rhs{nullptr}; - bool matched = FromIdentityCombiner(identity, combiner, &reducer, &combiner_lhs, &combiner_rhs); - if (!matched) { - if (self.defined()) { - throw NoMatchedReducerError(self.value()->mod, identity, combiner); - } else { - LOG(FATAL) << "ValueError: No matched reducer for the identity and the combiner of the " - "reduction block. So rfactor and cross-thread reduction cannot be applied."; - } - } - return std::make_tuple(std::move(reducer), std::move(combiner_lhs), std::move(combiner_rhs)); -} - -/******** Commutative Reducer ********/ - -bool MatchReducer(const CommReducer& reducer, const PrimExpr& identity, const PrimExpr& combiner, - const BufferLoad& load, PrimExpr* lhs, PrimExpr* rhs) { - if (!ExprDeepEqual()(reducer->identity_element[0], identity)) { - return false; - } - PatternMatcher pattern_matcher(reducer->result[0]); - pattern_matcher.Match(combiner); - if (pattern_matcher.Success()) { - PrimExpr lhs_tmp = pattern_matcher.Eval(reducer->lhs[0]); - PrimExpr rhs_tmp = pattern_matcher.Eval(reducer->rhs[0]); - if (ExprDeepEqual()(load, lhs_tmp)) { - *lhs = std::move(lhs_tmp); - *rhs = std::move(rhs_tmp); - } - return true; - } - return false; -} - -bool FromIdentityCombiner(const PrimExpr& identity, const BufferStore& combiner, - CommReducer* result_reducer, PrimExpr* lhs, PrimExpr* rhs) { - BufferLoad load(combiner->buffer, combiner->indices); - // Check reduction patterns. 
- for (const TypedPackedFunc& reducer_getter : GetReducerGetters()) { - CommReducer reducer = reducer_getter(identity.dtype()); - if (MatchReducer(reducer, identity, combiner->value, load, lhs, rhs)) { - *result_reducer = std::move(reducer); - return true; - } - } - return false; -} - /******** SRef Tree Related ********/ StmtSRef GetSRefTreeRoot(const StmtSRef& sref) { @@ -2072,8 +1552,8 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self, // const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Array loops = tir::GetLoops(block_sref); - // Cond 1. The block has only one write buffer - if (block->writes.size() != 1) { + // Cond 1. The block must have at least one write buffer + if (block->writes.size() == 0) { return false; } diff --git a/src/tir/schedule/analysis/reducer.cc b/src/tir/schedule/analysis/reducer.cc new file mode 100644 index 000000000000..50813ef3cae8 --- /dev/null +++ b/src/tir/schedule/analysis/reducer.cc @@ -0,0 +1,702 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" + +namespace tvm { +namespace tir { + +/******** Pattern Matcher ********/ + +/*! + * \brief PrimExpr pattern matcher.
+ * + * It is different from the pattern matcher in arith/pattern_match.h, which is dedicated + * for compile-time constant patterns. This pattern matcher can work on dynamic user-specific + * patterns. + * + * The code below shows how to use the pattern matcher. + * + * \code + * + * Var x("x"), y("y"); + * // use PrimExpr to declare patterns, x, y are holes that can be filled with + * PatternMatcher pattern_matcher(x + y); + * // expr = C[i, j] + A[i, k] * B[k, j], which is the expr we want to match + * pattern_matcher.Match(expr); + * + * if (pattern_matcher.Success()) { + * pattern_matcher.Eval(x) // C[i, j] + * pattern_matcher.Eval(y) // A[i, k] * B[k, j] + * } + * + * \endcode + */ +class PatternMatcher : public ExprVisitor { + public: + explicit PatternMatcher(Array pattern) : pattern_(std::move(pattern)) {} + + void VisitExpr_(const VarNode* op) final { + auto it = filled_map_.find(op); + if (it == filled_map_.end()) { + filled_map_[op] = expr_to_match_; + } else { + ExprDeepEqual equal; + if (it->second.same_as(expr_to_match_) || equal(it->second, expr_to_match_)) return; + match_success_ = false; + } + } + + void VisitExpr_(const LoadNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (!op->buffer_var.same_as(ptr->buffer_var)) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->predicate; + VisitExpr(op->predicate); + expr_to_match_ = ptr->index; + VisitExpr(op->index); + std::swap(expr_to_match_, tmp); + } + } + } + + void VisitExpr_(const LetNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->var; + VisitExpr(op->var); + expr_to_match_ = ptr->value; + VisitExpr(op->value); + expr_to_match_ = ptr->body; + VisitExpr(op->body); + std::swap(expr_to_match_, tmp); + } + } + + void VisitExpr_(const CallNode* op) final { + 
const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (!op->op.same_as(ptr->op)) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + for (size_t i = 0; i < op->args.size(); ++i) { + expr_to_match_ = ptr->args[i]; + VisitExpr(op->args[i]); + } + std::swap(expr_to_match_, tmp); + } + } + } + +#define TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OpName) \ + void VisitExpr_(const OpName* op) { \ + const auto* ptr = expr_to_match_.as(); \ + if (ptr == nullptr) { \ + match_success_ = false; \ + } else { \ + PrimExpr current = expr_to_match_; \ + expr_to_match_ = ptr->a; \ + VisitExpr(op->a); \ + expr_to_match_ = ptr->b; \ + VisitExpr(op->b); \ + std::swap(expr_to_match_, current); \ + } \ + } + + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AddNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(SubNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MulNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(DivNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(ModNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorDivNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(FloorModNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MinNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(MaxNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(EQNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(NENode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LTNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(LENode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GTNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(GENode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(AndNode); + TVM_DECLARE_PATTERN_MATCHER_BIN_OP(OrNode); + + void VisitExpr_(const CastNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (!runtime::TypeEqual(op->dtype, ptr->dtype)) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->value; + VisitExpr(op->value); + std::swap(expr_to_match_, tmp); + } + } + } + + void VisitExpr_(const NotNode* op) final { + const auto* 
ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->a; + VisitExpr(op->a); + std::swap(expr_to_match_, tmp); + } + } + + void VisitExpr_(const SelectNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->condition; + VisitExpr(op->condition); + expr_to_match_ = ptr->true_value; + VisitExpr(op->true_value); + expr_to_match_ = ptr->false_value; + VisitExpr(op->false_value); + std::swap(expr_to_match_, tmp); + } + } + + void VisitExpr_(const RampNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (op->lanes != ptr->lanes) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->base; + VisitExpr(op->base); + expr_to_match_ = ptr->stride; + VisitExpr(op->stride); + std::swap(expr_to_match_, tmp); + } + } + } + + void VisitExpr_(const BroadcastNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (op->lanes != ptr->lanes) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + expr_to_match_ = ptr->value; + VisitExpr(op->value); + std::swap(expr_to_match_, tmp); + } + } + } + + void VisitExpr_(const ShuffleNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (op->vectors.size() != ptr->vectors.size() || op->indices.size() != ptr->indices.size()) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + for (size_t i = 0; i < op->indices.size(); ++i) { + expr_to_match_ = ptr->indices[i]; + VisitExpr(op->indices[i]); + } + for (size_t i = 0; i < op->vectors.size(); ++i) { + expr_to_match_ = ptr->vectors[i]; + VisitExpr(op->vectors[i]); + } + std::swap(expr_to_match_, tmp); + } 
+ } + } + + void VisitExpr_(const IntImmNode* op) final { + const auto* ptr = expr_to_match_.as(); + match_success_ = ptr != nullptr && op->value == ptr->value; + } + + void VisitExpr_(const FloatImmNode* op) final { + const auto* ptr = expr_to_match_.as(); + match_success_ = ptr != nullptr && op->value == ptr->value; + } + + void VisitExpr_(const StringImmNode* op) final { + const auto* ptr = expr_to_match_.as(); + match_success_ = ptr != nullptr && op->value == ptr->value; + } + + void VisitExpr_(const BufferLoadNode* op) final { + const auto* ptr = expr_to_match_.as(); + if (ptr == nullptr) { + match_success_ = false; + } else { + if (!op->buffer.same_as(ptr->buffer) || op->indices.size() != ptr->indices.size()) { + match_success_ = false; + } else { + PrimExpr tmp = expr_to_match_; + for (size_t i = 0; i < op->indices.size(); ++i) { + expr_to_match_ = ptr->indices[i]; + VisitExpr(op->indices[i]); + } + std::swap(expr_to_match_, tmp); + } + } + } + + void Match(const Array& exprs_to_match) { + this->match_success_ = true; + this->filled_map_.clear(); + + ICHECK_EQ(pattern_.size(), exprs_to_match.size()); + int n_buffers = pattern_.size(); + for (int i = 0; i < n_buffers; ++i) { + this->expr_to_match_ = exprs_to_match[i]; + this->operator()(pattern_[i]); + } + } + + PrimExpr Eval(const Var& var) { + auto it = filled_map_.find(var.operator->()); + ICHECK(it != filled_map_.end()) << "Unknown pattern variable"; + ICHECK(match_success_) << "Match failed"; + return it->second; + } + + bool Success() const { return match_success_; } + + private: + bool match_success_{true}; + Array pattern_; + PrimExpr expr_to_match_; + std::unordered_map filled_map_; +}; + +/******** Reduction Block Related ********/ + +static const char* kRFactorCrossThreadReductionApplicableBlockDef = + R"(Definition of a reduction block that is applicable by RFactor and Cross-Thread Reduction: +1) The block init should be a single BufferStore or a SeqStmt of BufferStores +2) The buffers initialized 
in the block init should be all different +3) The number of consecutive LetStmts in the block body (if any) should equal the number of BufferStores in the block init +4) The variables of the LetStmts in the block body should be all different +5) The body of the innermost LetStmt should be a single BufferStore or a SeqStmt of BufferStores +6) The number of BufferStores under the block body should equal the number of BufferStores in the block init, and thereby equal the number of LetStmts above +7) The variables bound by the LetStmts in the block body must all directly serve as values of the BufferStores inside, and the stored values of the BufferStores can only be those variables +8) The variables stored by the BufferStores in the block body should be all different +9) The buffers written by the BufferStores in the block body should be all different +10) The buffers initialized in the block init and written in the block body should match +11) The buffers written by the block should have same shape +12) The indices of all BufferStores in the reduction block should be the same)"; + +void ErrorRFactorCrossThreadReductionNotApplicable(const Optional& self, Block block, + int violated_cond) { + class RFactorNotApplicableError : public ScheduleError { + public: + explicit RFactorNotApplicableError(IRModule mod, Block block, int violated_cond) + : mod_(std::move(mod)), block_(std::move(block)), violated_cond_(violated_cond) {} + + String FastErrorString() const final { + return "ScheduleError: RFactor cannot be applied to the block since the block does not meet " + "the requirements"; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "RFactor cannot be applied to block {0}, because the block violates condition #" + << violated_cond_ << ".\n" + << kRFactorCrossThreadReductionApplicableBlockDef; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + + IRModule 
mod_; + Block block_; + int violated_cond_; + }; + + if (self.defined()) { + throw RFactorNotApplicableError(self.value()->mod, std::move(block), violated_cond); + } else { + LOG(FATAL) << "ValueError: Cross-thread reduction cannot be applied to the block " + << block->name_hint << " because the block violates the condition #" << violated_cond + << ".\n" + << kRFactorCrossThreadReductionApplicableBlockDef; + } +} + +/*! + * \brief Extract the BufferStores, which serve as the reduction updates, from the given LetStmt and + * the BufferStores inside. And meanwhile set the buffer order of the reduction + * \param self The schedule state, used for error reporting + * \param block The reduction block, used for error reporting + * \param let The LetStmt from which the reduction updates are extracted + * \param n_buffers The number of buffers participating in the reduction + * \param updates The extracted reduction updates + * \param buf2index A mapping from reduction buffers to their indices of the reduction order + * \throw ScheduleError If rfactor or cross-thread reduction cannot be applied to the block + */ +void ExtractReductionUpdates(const Optional& self, Block block, + const LetStmtNode* let, int n_buffers, Array* updates, + std::unordered_map* buf2index) { + std::unordered_map var2index; + Array let_values; + let_values.reserve(n_buffers); + updates->resize(n_buffers); + + // Step 1. + // - Extract the BufferStore values from the LetStmts. + // - Construct the mapping from let variables to the index. 
+ for (int i = 0; i < n_buffers; ++i) { + if (let == nullptr) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/3); + } + + let_values.push_back(let->value); + auto insert_result = var2index.insert(std::make_pair(let->var.get(), i)); + if (!insert_result.second) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/4); + } + if (i != n_buffers - 1) { + let = let->body.as(); + } + } + + // There should be no more LetStmt. + if (let->body->IsInstance()) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/3); + } + + // Now `let` is expected to be the innermost LetStmt, whose body should either be a SeqStmt or + // a BufferStore + const auto* p_seq = let->body.as(); + const auto* p_buf_store = let->body.as(); + if (p_seq == nullptr && p_buf_store == nullptr) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/5); + } + SeqStmt seq = + p_seq != nullptr ? GetRef(p_seq) : SeqStmt({GetRef(p_buf_store)}); + if (static_cast(seq->seq.size()) != n_buffers) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/6); + } + + // Step 2. + // - Create BufferStores according to the variables being stored. + // - Construct the mapping from reduction buffers to the index. 
+ for (const Stmt& stmt : seq->seq) { + const auto* buf_store = stmt.as(); + if (buf_store == nullptr) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/5); + } + const auto* var = buf_store->value.as(); + if (var == nullptr) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/7); + } + auto it = var2index.find(var); + if (it == var2index.end()) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/7); + } + int idx = it->second; + if ((*updates)[idx].defined()) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/8); + } + updates->Set(idx, BufferStore(buf_store->buffer, let_values[idx], buf_store->indices)); + auto insert_result = buf2index->insert(std::make_pair(buf_store->buffer.get(), idx)); + if (!insert_result.second) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/9); + } + } + for (int i = 0; i < n_buffers; ++i) { + ICHECK((*updates)[i].defined()); + } +} + +std::pair, Array> GetInitValuesAndUpdatesFromReductionBlock( + const Optional& self, Block block) { + Array inits; + Array updates; + + // Step 1. Extract the BufferStores serving as block inits. + if (const auto* init = block->init.as()) { + inits.push_back(GetRef(init)); + } else if (const auto* seq_init = block->init.as()) { + std::unordered_set init_buffers; + for (const Stmt& stmt : seq_init->seq) { + init = stmt.as(); + if (init == nullptr) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/1); + } + auto insert_result = init_buffers.insert(init->buffer.get()); + if (!insert_result.second) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/2); + } + inits.push_back(GetRef(init)); + } + } else { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/1); + } + + // Step 2. 
Extract the block updates, in the form of BufferStores. + int n_buffers = inits.size(); + std::unordered_map buf2index; + if (const auto* update = block->body.as()) { + updates.push_back(GetRef(update)); + buf2index[update->buffer.get()] = 0; + } else { + const auto* let = block->body.as(); + ExtractReductionUpdates(self, block, let, n_buffers, &updates, &buf2index); + } + ICHECK_EQ(updates.size(), n_buffers); + + // Step 3. Set the init values according to the buffer order in `updates`, with the help of the + // mapping `buf2index`. + Array init_values; + init_values.resize(n_buffers); + + // - Check all buffers have the same shape + // - Check all indices of the BufferStores are the same + // - Check buffers written in the block init and the block body can match + // - Check buffers do not duplicate + const Array& expected_shape = updates[0]->buffer->shape; + const Array& expected_indices = updates[0]->indices; + ICHECK_EQ(expected_shape.size(), expected_indices.size()); + int n_dim = expected_indices.size(); + arith::Analyzer ana; + for (int i = 0; i < n_buffers; ++i) { + if (static_cast(updates[i]->buffer->shape.size()) != n_dim) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/11); + } + if (static_cast(inits[i]->indices.size()) != n_dim || + static_cast(updates[i]->indices.size()) != n_dim) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/12); + } + for (int d = 0; d < n_dim; ++d) { + if (!ana.CanProveEqual(updates[i]->buffer->shape[d], expected_shape[d])) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/11); + } + if (!ana.CanProveEqual(inits[i]->indices[d], expected_indices[d]) || + !ana.CanProveEqual(updates[i]->indices[d], expected_indices[d])) { + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/12); + } + } + + auto it = buf2index.find(inits[i]->buffer.get()); + if (it == buf2index.end()) 
{ + ErrorRFactorCrossThreadReductionNotApplicable(self, std::move(block), /*violated_cond=*/10); + } + int idx = it->second; + ICHECK(updates[idx]->buffer.same_as(inits[i]->buffer)); + ICHECK(!init_values[idx].defined()); + init_values.Set(idx, inits[i]->value); + } + for (int i = 0; i < n_buffers; ++i) { + ICHECK(init_values[i].defined()); + } + + return std::make_pair(init_values, updates); +} + +bool ContainsOnlyDataParAndReductionBlockIter(const Array& iters) { + for (const IterVar& iter_var : iters) { + if (iter_var->iter_type != kDataPar && iter_var->iter_type != kCommReduce) { + return false; + } + } + return true; +} + +bool ReductionIterNotIndexOutputBuffer(const Block& block) { + // Step 1. Collect the reduction block iters. + std::unordered_set reduction_block_iters; + reduction_block_iters.reserve(block->iter_vars.size()); + for (const IterVar& iter_var : block->iter_vars) { + if (iter_var->iter_type == kCommReduce) { + reduction_block_iters.insert(iter_var->var.get()); + } + } + // Step 2. Check if the reduction block iters are used to index the output buffer. 
+ std::unordered_set buffer_written; + buffer_written.reserve(block->writes.size()); + for (const BufferRegion& write_region : block->writes) { + buffer_written.insert(write_region->buffer.get()); + } + auto f_uses_reduction_block_var = [&](const PrimExpr& expr) -> bool { + return UsesVar(expr, [&](const VarNode* var) { // + return reduction_block_iters.count(var); + }); + }; + bool affected = false; + PreOrderVisit(block->body, [&](const ObjectRef& obj) { + if (affected) { + return false; + } + const auto* store = obj.as(); + if (!store) { + return true; + } + ICHECK(buffer_written.count(store->buffer.get())) + << "ValueError: The buffer \"" << store->buffer + << "\" is written in the block but is not in the block's signature"; + for (const PrimExpr& index : store->indices) { + if (f_uses_reduction_block_var(index)) { + affected = true; + return false; + } + } + return false; + }); + return !affected; +} + +class NoMatchedReducerError : public ScheduleError { + public: + explicit NoMatchedReducerError(IRModule mod, Array identities, + Array combiners) + : mod_(std::move(mod)), + identities_(std::move(identities)), + combiners_(std::move(combiners)) {} + + String FastErrorString() const final { + return "ScheduleError: No matched reducer for the identity and the combiner of this reduction " + "block. So rfactor and cross-thread reduction cannot be applied."; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "No matched reducer for identity " << identities_ << " and combiner " << combiners_ + << "In this case rfactor cannot be applied. 
You can check tvm::tir::ReducerRegistry for " + "default reducers or registering new reducers."; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + IRModule mod_; + Array identities_; + Array combiners_; +}; + +std::tuple, Array> GetReducerAndCombinerLhsRhs( + const Optional& self, const Array& identities, + const Array& combiners) { + CommReducer reducer{nullptr}; + Array combiner_lhs, combiner_rhs; + bool matched = + FromIdentityCombiner(identities, combiners, &reducer, &combiner_lhs, &combiner_rhs); + if (!matched) { + if (self.defined()) { + throw NoMatchedReducerError(self.value()->mod, identities, combiners); + } else { + LOG(FATAL) << "ValueError: No matched reducer for the identity and the combiner of the " + "reduction block. So rfactor and cross-thread reduction cannot be applied."; + } + } + return std::make_tuple(std::move(reducer), std::move(combiner_lhs), std::move(combiner_rhs)); +} + +/******** Commutative Reducer ********/ + +bool MatchReducer(const CommReducer& reducer, const Array& identities, + const Array& combined_values, const Array& buf_loads, + Array* lhs, Array* rhs) { + ExprDeepEqual equal; + ICHECK_EQ(identities.size(), combined_values.size()); + int n_buffers = identities.size(); + for (int i = 0; i < n_buffers; ++i) { + if (!equal(reducer->identity_element[i], identities[i])) { + return false; + } + } + + PatternMatcher pattern_matcher(reducer->result); + pattern_matcher.Match(combined_values); + Array lhs_tmp, rhs_tmp; + lhs_tmp.reserve(n_buffers); + rhs_tmp.reserve(n_buffers); + if (!pattern_matcher.Success()) { + return false; + } + + for (int i = 0; i < n_buffers; ++i) { + PrimExpr l = pattern_matcher.Eval(reducer->lhs[i]); + PrimExpr r = pattern_matcher.Eval(reducer->rhs[i]); + if (!equal(buf_loads[i], l)) { + return false; + } + lhs_tmp.push_back(l); + rhs_tmp.push_back(r); + } + *lhs = std::move(lhs_tmp); + *rhs = std::move(rhs_tmp); + return true; 
+} + +bool FromIdentityCombiner(const Array& identities, const Array& combiners, + CommReducer* result_reducer, Array* lhs, Array* rhs) { + int n = identities.size(); + Array buf_loads; + Array stored_values; + buf_loads.reserve(n); + stored_values.reserve(n); + + for (int i = 0; i < n; ++i) { + buf_loads.push_back(BufferLoad(combiners[i]->buffer, combiners[i]->indices)); + stored_values.push_back(combiners[i]->value); + } + + // Check reduction patterns. + for (const TypedPackedFunc(Array)>& reducer_getter : + GetReducerGetters()) { + Optional reducer = reducer_getter(identities); + if (!reducer.defined()) { + continue; + } + if (MatchReducer(reducer.value(), identities, stored_values, buf_loads, lhs, rhs)) { + *result_reducer = reducer.value(); + return true; + } + } + return false; +} + +} // namespace tir +} // namespace tvm diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index 1198e67d710a..2dc47fa15bea 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -297,29 +297,85 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref, */ struct ReducerRegistry { ReducerRegistry() - : reducer_getters{CreateReducerGetter([](const Var& x, const Var& y) { return x + y; }, - [](DataType dtype) { return make_const(dtype, 0); }), - CreateReducerGetter([](const Var& x, const Var& y) { return x * y; }, - [](DataType dtype) { return make_const(dtype, 1); }), - CreateReducerGetter([](const Var& x, const Var& y) { return min(x, y); }, - [](DataType dtype) { return max_value(dtype); }), - CreateReducerGetter([](const Var& x, const Var& y) { return max(x, y); }, - [](DataType dtype) { return min_value(dtype); })} {} - - static void RegisterReducer(TypedPackedFunc combiner_getter, - TypedPackedFunc identity_getter) { + : reducer_getters{CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{x[0] + y[0]}; + }, + [](const Array& 
values) { + return Array{make_const(values[0]->dtype, 0)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{x[0] * y[0]}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, 1)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{min(x[0], y[0])}; + }, + [](const Array& values) { + return Array{max_value(values[0]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{max(x[0], y[0])}; + }, + [](const Array& values) { + return Array{min_value(values[0]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]); + PrimExpr val = Select(x[1] >= y[1], x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + min_value(values[1]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]); + PrimExpr val = Select(x[1] <= y[1], x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + max_value(values[1]->dtype)}; + })} {} + + static void RegisterReducer( + int n_buffers, TypedPackedFunc(Array, Array)> combiner_getter, + TypedPackedFunc(Array)> identity_getter) { ReducerRegistry::Global()->reducer_getters.push_back(ReducerRegistry::CreateReducerGetter( - std::move(combiner_getter), std::move(identity_getter))); + n_buffers, std::move(combiner_getter), std::move(identity_getter))); } - static TypedPackedFunc CreateReducerGetter( - TypedPackedFunc combiner_getter, - TypedPackedFunc identity_getter) { - return [combiner_getter = std::move(combiner_getter), - identity_getter = std::move(identity_getter)](DataType dtype) -> CommReducer { - Var lhs("x", dtype); - Var rhs("y", dtype); - return 
CommReducer({lhs}, {rhs}, {combiner_getter(lhs, rhs)}, {identity_getter(dtype)}); + static TypedPackedFunc(Array)> CreateReducerGetter( + int n_buffers, TypedPackedFunc(Array, Array)> combiner_getter, + TypedPackedFunc(Array)> identity_getter) { + return [n_buffers, // + combiner_getter = std::move(combiner_getter), // + identity_getter = std::move(identity_getter) // + ](Array values) -> Optional { + if (static_cast(values.size()) != n_buffers) { + return NullOpt; + } + Array lhs; + Array rhs; + for (int i = 0; i < n_buffers; ++i) { + lhs.push_back(Var("x" + std::to_string(i), values[i]->dtype)); + rhs.push_back(Var("y" + std::to_string(i), values[i]->dtype)); + } + return CommReducer(lhs, rhs, combiner_getter(lhs, rhs), identity_getter(values)); }; } @@ -328,10 +384,10 @@ struct ReducerRegistry { return &instance; } - std::vector> reducer_getters; + std::vector(Array)>> reducer_getters; }; -std::vector> GetReducerGetters() { +std::vector(Array)>> GetReducerGetters() { return ReducerRegistry::Global()->reducer_getters; } @@ -508,44 +564,57 @@ std::unordered_map GetLoopVar2LoopMap(const Array& loo } /*! 
- * \brief Create the intermediate rfactor buffer, which the rfactor block writes to and the + * \brief Create the intermediate rfactor buffers, which the rfactor block writes to and the * write-back block reads from - * \param buffer The buffer written by the reduction block + * \param buf_stores The BufferStores of the original block, where the rfactor buffers will be + * created from * \param factor_axis The `factor_axis` parameter of rfactor * \param rf_loop The rfactor loop * \return The new created intermediate rfactor buffer */ -Buffer CreateRFactorBuffer(const Buffer& buffer, int factor_axis, const ForNode* rf_loop) { - Array rf_shape = buffer->shape; - rf_shape.insert(rf_shape.begin() + factor_axis, rf_loop->extent); - - ObjectPtr n = make_object(*buffer.get()); - n->shape = rf_shape; - n->name = buffer->name + ".rf"; - n->data = buffer->data.copy_with_suffix(".rf"); - return Buffer(n); +Array CreateRFactorBuffers(const Array& buf_stores, int factor_axis, + const ForNode* rf_loop) { + Array rf_buffers; + rf_buffers.reserve(buf_stores.size()); + for (const BufferStore& buf_store : buf_stores) { + Buffer buffer = buf_store->buffer; + Array rf_shape = buffer->shape; + rf_shape.insert(rf_shape.begin() + factor_axis, rf_loop->extent); + + ObjectPtr n = make_object(*buffer.get()); + n->shape = rf_shape; + n->name = buffer->name + ".rf"; + n->data = buffer->data.copy_with_suffix(".rf"); + rf_buffers.push_back(Buffer(n)); + } + return rf_buffers; } /*! 
* \brief The base class of the rfactor/write-back block creator, which creates the blocks in four * steps: * 1) Create the new block iters and the their iter bindings - * 2) Create the reduction update of the new block + * 2) Create the body and init of the new block * 3) Create the read/write regions of the new block * 4) Create the new block and the new block-realize */ class BaseBlockCreator { public: explicit BaseBlockCreator(BlockRealize old_block_realize, For rf_loop, - BufferStore old_reduction_update, CommReducer reducer, Buffer rf_buffer, - bool is_rf_block) + Array old_reduction_updates, CommReducer reducer, + Array rf_buffers, bool is_rf_block) : old_block_realize_(std::move(old_block_realize)), rf_loop_(std::move(rf_loop)), - old_reduction_update_(std::move(old_reduction_update)), + old_reduction_updates_(std::move(old_reduction_updates)), reducer_(std::move(reducer)), - rf_buffer_(std::move(rf_buffer)), + rf_buffers_(std::move(rf_buffers)), + n_buffers_(static_cast(rf_buffers_.size())), is_rf_block_(is_rf_block) { n_block_iters_ = static_cast(old_block_realize_->iter_values.size()); + update_buffers_.reserve(n_buffers_); + update_indices_.reserve(n_buffers_); + update_lhs_.reserve(n_buffers_); + update_rhs_.reserve(n_buffers_); } void CreateBlock() { @@ -560,7 +629,15 @@ class BaseBlockCreator { break; } } - CreateReductionUpdate(has_reduce_iter); + + // The pre-processing finds out the buffers written in the block, the indices of the buffer + // accesses, and the reduction LHS and RHS of the stored values. 
+ PreProcess(); + Stmt block_body = Substitute(CreateBlockBody(has_reduce_iter), var_map_); + Optional block_init = CreateBlockInit(has_reduce_iter); + if (block_init.defined()) { + block_init = Substitute(block_init.value(), var_map_); + } CreateReadWriteRegions(); String new_block_name = old_block_realize_->block->name_hint; @@ -569,17 +646,13 @@ class BaseBlockCreator { new_block_name = new_block_name + "_rf"; predicate = old_block_realize_->predicate; } - Optional init_block = - has_reduce_iter ? BufferStore(new_reduction_update_->buffer, reducer_->identity_element[0], - new_reduction_update_->indices) - : Optional(NullOpt); new_block_ = Block( /*iter_vars=*/iter_vars_, /*reads=*/read_regions_, /*writes=*/write_regions_, /*name_hint=*/new_block_name, - /*body=*/new_reduction_update_, - /*init=*/init_block, + /*body=*/std::move(block_body), + /*init=*/std::move(block_init), /*alloc_buffers=*/{}, /*match_buffers=*/{}, /*annotations=*/old_block_realize_->block->annotations); @@ -589,9 +662,58 @@ class BaseBlockCreator { private: virtual void CreateAdditionalIter() = 0; virtual void CreateNormalIters(int idx) = 0; - virtual void CreateReductionUpdate(bool has_reduce_iter) = 0; + virtual void PreProcess() = 0; virtual void CreateReadWriteRegions() = 0; + Stmt CreateBlockBody(bool has_reduce_iter) { + Array buf_stores; + buf_stores.reserve(n_buffers_); + + // Case 1. If the block has no reduction iterator, we just store the RHS values into the + // buffers. + if (!has_reduce_iter) { + for (int i = 0; i < n_buffers_; ++i) { + buf_stores.push_back(BufferStore(update_buffers_[i], update_rhs_[i], update_indices_[i])); + } + return n_buffers_ > 1 ? SeqStmt(buf_stores) : buf_stores[0]; + } + + // Case 2. If the reduction is for single buffer, the block body is a single BufferStore. + Array stored_values = (*reducer_.get())(update_lhs_, update_rhs_); + if (n_buffers_ == 1) { + return BufferStore(update_buffers_[0], stored_values[0], update_indices_[0]); + } + + // Case 3. 
In case the reduction is for multiple buffers, we should create the reduction with + // LetStmt so that the reduction execution generates correct results. + Array let_vars; + let_vars.reserve(n_buffers_); + for (int i = 0; i < n_buffers_; ++i) { + Var var("v_" + update_buffers_[i]->name, PrimType(stored_values[i]->dtype)); + let_vars.push_back(var); + buf_stores.push_back(BufferStore(update_buffers_[i], var, update_indices_[i])); + } + Stmt body = SeqStmt(buf_stores); + for (int i = n_buffers_ - 1; i >= 0; --i) { + body = LetStmt(let_vars[i], stored_values[i], std::move(body)); + } + return body; + } + + Optional CreateBlockInit(bool has_reduce_iter) { + if (!has_reduce_iter) { + return NullOpt; + } + + Array inits; + inits.reserve(n_buffers_); + for (int i = 0; i < n_buffers_; ++i) { + inits.push_back( + BufferStore(update_buffers_[i], reducer_->identity_element[i], update_indices_[i])); + } + return n_buffers_ > 1 ? SeqStmt(inits) : inits[0]; + } + public: /*! \brief The new created block */ Block new_block_; @@ -607,12 +729,19 @@ class BaseBlockCreator { int n_block_iters_; /*! \brief The rfactor loop */ For rf_loop_; - /*! \brief The update BufferStore of the old block */ - BufferStore old_reduction_update_; + /*! \brief The update BufferStores of the old block */ + Array old_reduction_updates_; /*! \brief The matched commutative reducer */ CommReducer reducer_; - /*! \brief The intermediate rfactor buffer */ - Buffer rf_buffer_; + /*! \brief The intermediate rfactor buffers */ + Array rf_buffers_; + /*! \brief The number of rfactor buffers. */ + const int n_buffers_; + /*! + * \brief A mapping which maps old block iters to new expressions. The old iters will be replaced + * by the expressions in future substitution for the two blocks + */ + Map var_map_; /*! \brief Whether we are creating the rfactor block or the write-back block */ bool is_rf_block_; @@ -620,13 +749,14 @@ class BaseBlockCreator { std::vector iter_vars_; /*! 
\brief The new block iter bindings of the new created block-realize */ std::vector iter_values_; - /*! - * \brief A mapping which maps old block iters to new expressions. The old iters will be replaced - * by the expressions in future substitution for the two blocks - */ - Map var_map_; - /*! \brief The update BufferStore of the new created block */ - BufferStore new_reduction_update_; + /*! \brief The buffers updated in this block */ + Array update_buffers_; + /*! \brief The indices of the buffers updated in this block, respectively */ + Array> update_indices_; + /*! \brief The LHS values of the reduction in this block */ + Array update_lhs_; + /*! \brief THe RHS values of the reduction in this block */ + Array update_rhs_; /*! \brief The read regions of the new created block */ Array read_regions_; /*! \brief The write regions of the new created block */ @@ -658,13 +788,13 @@ class BaseBlockCreator { class RFactorBlockCreator : public BaseBlockCreator { public: explicit RFactorBlockCreator(BlockRealize old_block_realize, For rf_loop, - BufferStore old_reduction_update, CommReducer reducer, - Buffer rf_buffer, + Array old_reduction_updates, CommReducer reducer, + Array rf_buffers, std::unordered_map loop_vars2loop, - int factor_axis, PrimExpr combiner_rhs) + int factor_axis, Array combiner_rhs) : BaseBlockCreator(std::move(old_block_realize), std::move(rf_loop), - std::move(old_reduction_update), std::move(reducer), std::move(rf_buffer), - true), + std::move(old_reduction_updates), std::move(reducer), + std::move(rf_buffers), true), loop_vars2loop_(std::move(loop_vars2loop)), factor_axis_(factor_axis), combiner_rhs_(std::move(combiner_rhs)) {} @@ -718,41 +848,38 @@ class RFactorBlockCreator : public BaseBlockCreator { var_map_.Set(old_iter->var, Substitute(old_binding, loop_var2block_binding_)); } - void CreateReductionUpdate(bool has_reduce_iter) final { - rf_buf_access_indices_ = old_reduction_update_->indices; + void PreProcess() final { + // The accessed 
indices for all reduction buffers are the same. + rf_buf_access_indices_ = old_reduction_updates_[0]->indices; rf_buf_access_indices_.insert(rf_buf_access_indices_.begin() + factor_axis_, additional_iter_->var); - PrimExpr rhs{nullptr}; - if (has_reduce_iter) { - rhs = (*reducer_.get())({BufferLoad(rf_buffer_, rf_buf_access_indices_)}, {combiner_rhs_})[0]; - } else { - rhs = combiner_rhs_; + for (int i = 0; i < n_buffers_; ++i) { + update_buffers_.push_back(rf_buffers_[i]); + update_indices_.push_back(rf_buf_access_indices_); + update_lhs_.push_back(BufferLoad(update_buffers_[i], rf_buf_access_indices_)); + update_rhs_.push_back(combiner_rhs_[i]); } - new_reduction_update_ = BufferStore(rf_buffer_, rhs, rf_buf_access_indices_); - new_reduction_update_ = Downcast(Substitute(new_reduction_update_, var_map_)); } void CreateReadWriteRegions() final { + Map buffer_map; + for (int i = 0; i < n_buffers_; ++i) { + buffer_map.Set(old_reduction_updates_[i]->buffer, rf_buffers_[i]); + } const Block& old_block = old_block_realize_->block; - read_regions_ = CreateRegions(old_block->reads); - write_regions_ = CreateRegions(old_block->writes); - } - - Array CreateRegions(const Array& old_regions) { - Array new_regions; - new_regions.reserve(old_regions.size()); - for (const BufferRegion& buffer_region : old_regions) { - if (buffer_region->buffer.same_as(old_reduction_update_->buffer)) { - Array region = buffer_region->region; - region.insert(region.begin() + factor_axis_, - Range::FromMinExtent(additional_iter_->var, 1)); - new_regions.push_back(BufferRegion(rf_buffer_, Substitute(region, var_map_))); - } else { - new_regions.push_back( - BufferRegion(buffer_region->buffer, Substitute(buffer_region->region, var_map_))); - } + read_regions_.reserve(old_block->reads.size()); + for (const BufferRegion& read_region : old_block->reads) { + read_regions_.push_back( + BufferRegion(read_region->buffer, Substitute(read_region->region, var_map_))); + } + 
write_regions_.reserve(old_block->writes.size()); + for (const BufferRegion& write_region : old_block->writes) { + Array region = write_region->region; + region.insert(region.begin() + factor_axis_, Range::FromMinExtent(additional_iter_->var, 1)); + Optional rf_buffer = buffer_map.Get(write_region->buffer); + ICHECK(rf_buffer.defined()); + write_regions_.push_back(BufferRegion(rf_buffer.value(), Substitute(region, var_map_))); } - return new_regions; } public: @@ -767,8 +894,8 @@ class RFactorBlockCreator : public BaseBlockCreator { std::unordered_map loop_vars2loop_; /*! \brief The factor_axis specified for rfactor */ int factor_axis_; - /*! \brief The rhs of the combiner in the reduction update of the old block */ - PrimExpr combiner_rhs_; + /*! \brief The RHS values of the reduction in the old block */ + Array combiner_rhs_; /*! * \brief A mapping which maps loop vars to new created block iters. This map is used to * substitute the loop vars which appear in the bindings of some old block iters with the new @@ -784,12 +911,13 @@ class RFactorBlockCreator : public BaseBlockCreator { class WriteBackBlockCreator : public BaseBlockCreator { public: explicit WriteBackBlockCreator(BlockRealize old_block_realize, For rf_loop, - BufferStore old_reduction_update, CommReducer reducer, - Buffer rf_buffer, IterVar rf_additional_iter, - PrimExpr combiner_lhs, Array rf_buf_access_indices) + Array old_reduction_updates, CommReducer reducer, + Array rf_buffers, IterVar rf_additional_iter, + Array combiner_lhs, + Array rf_buf_access_indices) : BaseBlockCreator(std::move(old_block_realize), std::move(rf_loop), - std::move(old_reduction_update), std::move(reducer), std::move(rf_buffer), - false), + std::move(old_reduction_updates), std::move(reducer), + std::move(rf_buffers), false), rf_additional_iter_(std::move(rf_additional_iter)), combiner_lhs_(std::move(combiner_lhs)) { iter_vars_.reserve(n_block_iters_); @@ -817,39 +945,40 @@ class WriteBackBlockCreator : public 
BaseBlockCreator { } } - void CreateReductionUpdate(bool has_reduce_iter) final { - wb_lhs_ = Downcast(Substitute(combiner_lhs_, var_map_)); - wb_rhs_ = - Downcast(Substitute(BufferLoad(rf_buffer_, rf_buf_access_indices_), var_map_)); - new_reduction_update_ = - BufferStore(old_reduction_update_->buffer, (*reducer_.get())({wb_lhs_}, {wb_rhs_})[0], - old_reduction_update_->indices); - new_reduction_update_ = Downcast(Substitute(new_reduction_update_, var_map_)); + void PreProcess() final { + for (int i = 0; i < n_buffers_; ++i) { + PrimExpr rhs = BufferLoad(rf_buffers_[i], rf_buf_access_indices_); + update_buffers_.push_back(old_reduction_updates_[i]->buffer); + update_indices_.push_back(old_reduction_updates_[i]->indices); + update_lhs_.push_back(Substitute(combiner_lhs_[i], var_map_)); + update_rhs_.push_back(Substitute(std::move(rhs), var_map_)); + } } void CreateReadWriteRegions() final { - read_regions_.push_back(CreateRegion(wb_rhs_)); - write_regions_.push_back(CreateRegion(wb_lhs_)); + CreateRegion(update_rhs_, true); + CreateRegion(update_lhs_, false); } - static BufferRegion CreateRegion(const BufferLoad& load) { - Array region; - region.reserve(load->indices.size()); - for (const PrimExpr& index : load->indices) { - region.push_back(Range::FromMinExtent(index, 1)); + void CreateRegion(const Array& buf_loads, bool is_read) { + Array& buf_regions = is_read ? read_regions_ : write_regions_; + for (const PrimExpr& expr : buf_loads) { + const auto* buf_load = expr.as(); + ICHECK(buf_load != nullptr); + Array region; + region.reserve(buf_load->indices.size()); + for (const PrimExpr& index : buf_load->indices) { + region.push_back(Range::FromMinExtent(index, 1)); + } + buf_regions.push_back(BufferRegion(buf_load->buffer, std::move(region))); } - return BufferRegion(load->buffer, std::move(region)); } private: /*! \brief The new created additional block iter of the rfactor block */ IterVar rf_additional_iter_; - /*! 
\brief The lhs of the combiner in the reduction update of the old block */ - PrimExpr combiner_lhs_; - /*! \brief The lhs of the combiner of the write-back block */ - BufferLoad wb_lhs_; - /*! \brief The rhs of the combiner of the write-back block */ - BufferLoad wb_rhs_; + /*! \brief The LHS values of the reduction in the old block */ + Array combiner_lhs_; }; /*! @@ -924,14 +1053,16 @@ class BlockReplacer : public StmtMutator { BlockRealize wb_block_realize, BlockRealize old_block_realize, For rf_loop, std::unordered_set reduce_loop_vars, std::unordered_map loop_vars2loop, - const Buffer& rf_buffer) { + const Array& rf_buffers) { BlockReplacer replacer(std::move(rf_body), std::move(outermost_loop), std::move(wb_block_realize), std::move(old_block_realize), std::move(rf_loop), std::move(reduce_loop_vars), std::move(loop_vars2loop)); Block new_scope_root = Downcast(replacer(std::move(scope_root_block))); BlockNode* p = new_scope_root.CopyOnWrite(); - p->alloc_buffers.push_back(rf_buffer); + for (const Buffer& rf_buffer : rf_buffers) { + p->alloc_buffers.push_back(rf_buffer); + } return new_scope_root; } @@ -1040,13 +1171,19 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax // commutative reducer, combiner lhs and combiner rhs from the reduction identity and the // reduction combiner. The lhs will be used when constructing the write-back block, and the rhs // will be used when constructing the rfactor block. 
- auto [init, update] = GetBufferStoresFromReductionBlock(self, block); - auto [reducer, combiner_lhs, combiner_rhs] = - GetReducerAndCombinerLhsRhs(self, init->value, update); + Array init_values{nullptr}; + Array updates{nullptr}; + CommReducer reducer{nullptr}; + Array combiner_lhs{nullptr}; + Array combiner_rhs{nullptr}; + std::tie(init_values, updates) = GetInitValuesAndUpdatesFromReductionBlock(self, block); + std::tie(reducer, combiner_lhs, combiner_rhs) = + GetReducerAndCombinerLhsRhs(self, init_values, updates); // Step 6. Check whether `factor_axis` is in a correct range, and convert it to non-negative if it // is negative. - factor_axis = FactorAxisOutOfRangeError::CheckAndUpdate(self->mod, update->buffer, factor_axis); + factor_axis = + FactorAxisOutOfRangeError::CheckAndUpdate(self->mod, updates[0]->buffer, factor_axis); // ***************************************************** // * IR Manipulation * @@ -1056,17 +1193,17 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax // Step 1. Create the intermediate buffer (a.k.a. rfactor buffer), which has an additional // dimension that specified by `factor_axis` and `rf_loop`. - Buffer rf_buffer = CreateRFactorBuffer(update->buffer, factor_axis, rf_loop); + Array rf_buffers = CreateRFactorBuffers(updates, factor_axis, rf_loop); // Step 2. Create the rfactor block. - RFactorBlockCreator rf_block_creator(block_realize, GetRef(rf_loop), update, reducer, - rf_buffer, loop_vars2loop, factor_axis, + RFactorBlockCreator rf_block_creator(block_realize, GetRef(rf_loop), updates, reducer, + rf_buffers, loop_vars2loop, factor_axis, std::move(combiner_rhs)); rf_block_creator.CreateBlock(); // Step 3. Create the write-back block. 
- WriteBackBlockCreator wb_block_creator(block_realize, GetRef(rf_loop), update, reducer, - rf_buffer, std::move(rf_block_creator.additional_iter_), + WriteBackBlockCreator wb_block_creator(block_realize, GetRef(rf_loop), updates, reducer, + rf_buffers, std::move(rf_block_creator.additional_iter_), std::move(combiner_lhs), std::move(rf_block_creator.rf_buf_access_indices_)); wb_block_creator.CreateBlock(); @@ -1082,7 +1219,7 @@ StmtSRef RFactor(ScheduleState self, const StmtSRef& rf_loop_sref, int factor_ax Block old_scope_root_block = GetRef(scope_root->StmtAs()); Block new_scope_root_block = BlockReplacer::Replace( old_scope_root_block, rf_body, loops[0], wb_block_creator.new_block_realize_, block_realize, - GetRef(rf_loop), reduce_loop_vars, loop_vars2loop, rf_buffer); + GetRef(rf_loop), reduce_loop_vars, loop_vars2loop, rf_buffers); self->Replace( scope_root, new_scope_root_block, {{old_scope_root_block, new_scope_root_block}, {block, wb_block_creator.new_block_}}); @@ -1157,8 +1294,9 @@ TVM_REGISTER_INST_KIND_TRAITS(DecomposeReductionTraits); /******** FFI ********/ TVM_REGISTER_GLOBAL("tir.schedule.RegisterReducer") - .set_body_typed([](PackedFunc combiner_getter, PackedFunc identity_getter) { - ReducerRegistry::RegisterReducer(std::move(combiner_getter), std::move(identity_getter)); + .set_body_typed([](int n_buffers, PackedFunc combiner_getter, PackedFunc identity_getter) { + ReducerRegistry::RegisterReducer(n_buffers, std::move(combiner_getter), + std::move(identity_getter)); }); } // namespace tir diff --git a/src/tir/transforms/lower_cross_thread_reduction.cc b/src/tir/transforms/lower_cross_thread_reduction.cc index 04b025b5f9ae..c10555e74d07 100644 --- a/src/tir/transforms/lower_cross_thread_reduction.cc +++ b/src/tir/transforms/lower_cross_thread_reduction.cc @@ -111,70 +111,66 @@ bool IsReductionBlock(const BlockRealize& realize, const Map& loop_r } /*! 
- * \brief Create an intermediate buffer with specified name and data type - * \param name The specified name - * \param dtype The specified data type - * \return The created buffer + * \brief Create intermediate buffers according to the input buffers and buffer kind + * \param reduction_buffers The old reduction buffers which provide the buffer names and data types + * \param is_cross_thread_buffer A boolean indicating whether to create buffers for the cross-thread + * computation results or not, which is used for determine the buffer name prefix + * \return The created buffers */ -Buffer MakeScratchpad(String name, const DataType& dtype) { - return Buffer(/*ptr=*/Var(name, PointerType(PrimType(dtype), "local")), - /*dtype=*/dtype, - /*shape=*/{Integer(1)}, - /*strides=*/{Integer(1)}, - /*elem_offset=*/PrimExpr{nullptr}, - /*name=*/name, - /*data_alignment=*/0, - /*offset_factor=*/0, - /*buffer_type=*/kDefault); -} - -/*! - * \brief Remove the BufferRegions whose buffer is the input buffer - * \param buffer_regions The array of BufferRegions to be - * \param buffer_to_remove The specified buffer - * \return The mutated array of BufferRegions, no longer containing BufferRegion of the input buffer - */ -Array RemoveBufferFromBufferRegions(const Array& buffer_regions, - const Buffer& buffer_to_remove) { - Array res; - res.reserve(buffer_regions.size()); - for (const BufferRegion& buffer_region : buffer_regions) { - if (!buffer_region->buffer.same_as(buffer_to_remove)) { - res.push_back(buffer_region); - } +Array MakeScratchpads(const Array& reduction_buffers, bool is_cross_thread_buffer) { + Array new_buffers; + new_buffers.reserve(reduction_buffers.size()); + for (const Buffer& buffer : reduction_buffers) { + String name = is_cross_thread_buffer ? 
"cross" : "in"; + name = name + "_thread_" + buffer->name; + new_buffers.push_back(Buffer(/*ptr=*/Var(name, PointerType(PrimType(buffer->dtype), "local")), + /*dtype=*/buffer->dtype, + /*shape=*/{Integer(1)}, + /*strides=*/{Integer(1)}, + /*elem_offset=*/PrimExpr{nullptr}, + /*name=*/name, + /*data_alignment=*/0, + /*offset_factor=*/0, + /*buffer_type=*/kDefault)); } - return res; + return new_buffers; } /*! - * \brief Substitute a given source buffer with a given target buffer in statements or expressions + * \brief Substitute given source buffers with given target buffers respectively in the input + * statement */ class BufferReplacer : private StmtExprMutator { public: - static Stmt Run(Buffer src_buffer, Buffer tgt_buffer, Stmt stmt) { - return BufferReplacer(src_buffer, tgt_buffer)(std::move(stmt)); + static Stmt Run(Array src_buffers, Array tgt_buffers, Stmt stmt) { + Map buffer_map; + ICHECK_EQ(src_buffers.size(), tgt_buffers.size()); + int n_buffers = src_buffers.size(); + for (int i = 0; i < n_buffers; ++i) { + buffer_map.Set(src_buffers[i], tgt_buffers[i]); + } + return BufferReplacer(buffer_map)(std::move(stmt)); } private: - explicit BufferReplacer(Buffer src_buffer, Buffer tgt_buffer) - : src_buffer_(std::move(src_buffer)), tgt_buffer_(std::move(tgt_buffer)) {} + explicit BufferReplacer(Map buffer_map) : buffer_map_(std::move(buffer_map)) {} PrimExpr VisitExpr_(const BufferLoadNode* load) final { - return load->buffer.same_as(src_buffer_) ? BufferLoad(tgt_buffer_, {0}) - : GetRef(load); + auto it = buffer_map_.find(load->buffer); + return it != buffer_map_.end() ? 
BufferLoad((*it).second, {0}) : GetRef(load); } Stmt VisitStmt_(const BufferStoreNode* store) final { - if (store->buffer.same_as(src_buffer_)) { + auto it = buffer_map_.find(store->buffer); + if (it != buffer_map_.end()) { PrimExpr value = StmtExprMutator::VisitExpr(store->value); - return BufferStore(tgt_buffer_, value, {0}); + return BufferStore((*it).second, std::move(value), {0}); } else { return StmtMutator::VisitStmt_(store); } } - Buffer src_buffer_; - Buffer tgt_buffer_; + Map buffer_map_; }; /*! @@ -231,25 +227,40 @@ class InThreadReducerMaker : private StmtMutator { /*! * \brief Create the lowered allreduce block transformed from the input reduction block - * \param reduction_block The input reduction block - * \param it_buffer The buffer to store in-thread reduction results - * \param ct_buffer The buffer to store cross-thread reduction results + * \param realize The block-realize which contains the old reduction block + * \param it_buffers The buffers to store in-thread reduction results + * \param ct_buffers The buffers to store cross-thread reduction results + * \param wb_buffers The buffers to store the final reduction results + * \param old_wb_indices The indices used to access the write-back buffers when storing the final + * reduction results into the write-back buffers * \param reducer The reduction function - * \param combiner_rhs The RHS of the combiner + * \param combiner_rhs The RHS values of the combiner * \param reduction_loops The reduction loops */ -Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional& it_buffer, - const Buffer& ct_buffer, const CommReducer& reducer, - const PrimExpr& combiner_rhs, +Stmt TransformReductionBlock(const BlockRealizeNode* realize, // + const Optional>& it_buffers, // + const Array& ct_buffers, // + const Array& wb_buffers, // + const Array& old_wb_indices, // + const CommReducer& reducer, // + const Array& combiner_rhs, // const std::vector& reduction_loops) { + int n_buffers = 
wb_buffers.size(); const BlockNode* block = realize->block.get(); - Buffer wb_buffer = block->writes[0]->buffer; - Array wb_region = block->writes[0]->region; - BufferRegion ct_buffer_region(ct_buffer, {Range::FromMinExtent(0, 1)}); - Optional it_buffer_region = NullOpt; - if (it_buffer.defined()) { - it_buffer_region = BufferRegion(it_buffer.value(), {Range::FromMinExtent(0, 1)}); + auto f_create_buffer_regions = [](Array buffers) { + Array regions; + regions.reserve(buffers.size()); + for (const Buffer& buffer : buffers) { + regions.push_back(BufferRegion(buffer, {Range::FromMinExtent(0, 1)})); + } + return regions; + }; + + Array ct_buffer_regions = f_create_buffer_regions(ct_buffers); + Optional> it_buffer_regions = NullOpt; + if (it_buffers.defined()) { + it_buffer_regions = f_create_buffer_regions(it_buffers.value()); } // In total, the block is transformed into at most 4 statements // - Stmt 1: initialize the buffer for in-thread reduction @@ -259,35 +270,35 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional stmts; stmts.reserve(4); // Stmt 1: initialize the buffer for in-thread reduction - if (it_buffer.defined()) { - BufferStore init = Downcast(block->init); - stmts.push_back(BlockRealize( - /*iter_values=*/{}, - /*predicate=*/const_true(), - /*block=*/ - Block(/*iter_vars=*/{}, - /*reads=*/{}, - /*writes=*/{it_buffer_region.value()}, - /*name_hint=*/block->name_hint + "_in_thread_init", - /*body=*/ - BufferStore(/*buffer=*/it_buffer.value(), - /*value=*/init->value, - /*indices=*/{Integer(0)})))); + if (it_buffers.defined()) { + Array inits; + inits.reserve(n_buffers); + for (int i = 0; i < n_buffers; ++i) { + inits.push_back( + BufferStore(it_buffers.value()[i], reducer->identity_element[i], {Integer(0)})); + } + stmts.push_back(BlockRealize(/*iter_values=*/{}, + /*predicate=*/const_true(), + /*block=*/ + Block(/*iter_vars=*/{}, + /*reads=*/{}, + /*writes=*/it_buffer_regions.value(), + /*name_hint=*/block->name_hint + 
"_in_thread_init", + /*body=*/n_buffers > 1 ? SeqStmt(inits) : inits[0]))); } // Stmt 2: do in-thread reduction { Optional new_realize = NullOpt; // If need to generate in-thread reduction, - // then replace `wb_buffer` with `it_buffer` accordingly in given BlockRealize + // then replace `wb_buffers` with `it_buffers` accordingly in given BlockRealize // otherwise, directly remove given BlockRealize - if (it_buffer.defined()) { + if (it_buffers.defined()) { ObjectPtr new_block = make_object(*block); - new_block->reads = RemoveBufferFromBufferRegions(std::move(new_block->reads), wb_buffer); - new_block->reads.push_back(it_buffer_region.value()); - new_block->writes = {it_buffer_region.value()}; + new_block->reads = std::move(new_block->reads); + new_block->writes = it_buffer_regions.value(); new_block->name_hint = new_block->name_hint + "_in_thread"; new_block->body = - BufferReplacer::Run(wb_buffer, it_buffer.value(), std::move(new_block->body)); + BufferReplacer::Run(wb_buffers, it_buffers.value(), std::move(new_block->body)); new_block->init = NullOpt; ObjectPtr n = make_object(*realize); n->block = Block(new_block); @@ -303,19 +314,23 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional parameters; parameters.reserve(reduction_loops.size() + 4); - // 1-st argument: size - parameters.push_back(make_const(DataType::UInt(32), 1)); - // 2-nd argument: source - if (it_buffer.defined()) { - parameters.push_back(BufferLoad(it_buffer.value(), {Integer(0)})); + // 1-st argument: number of buffers + parameters.push_back(make_const(DataType::UInt(32), n_buffers)); + // Next `n_buffers` arguments: sources + if (it_buffers.defined()) { + for (int i = 0; i < n_buffers; ++i) { + parameters.push_back(BufferLoad(it_buffers.value()[i], {Integer(0)})); + } } else { - parameters.push_back(combiner_rhs); + parameters.insert(parameters.end(), combiner_rhs.begin(), combiner_rhs.end()); } - // 3-rd argument: predicate + // Next argument: predicate 
parameters.push_back(const_true()); - // 4-th argument: destination - parameters.push_back(BufferLoad(ct_buffer, {0})); - // next arguments: all the reduction threads + // Next `n_buffers` arguments: destinations + for (int i = 0; i < n_buffers; ++i) { + parameters.push_back(BufferLoad(ct_buffers[i], {0})); + } + // Next arguments: all the reduction threads for (const ForNode* reduction_loop : reduction_loops) { if (reduction_loop->thread_binding.defined()) { parameters.push_back(reduction_loop->loop_var); @@ -325,14 +340,14 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optional iter_vars{nullptr}; Array bindings{nullptr}; Array reads{nullptr}; - if (it_buffer.defined()) { + if (it_buffers.defined()) { iter_vars = Array{}; bindings = Array{}; - reads = {it_buffer_region.value()}; + reads = it_buffer_regions.value(); } else { iter_vars = block->iter_vars; bindings = realize->iter_values; - reads = {RemoveBufferFromBufferRegions(block->reads, wb_buffer)}; + reads = block->reads; } stmts.push_back(BlockRealize( /*iter_values=*/std::move(bindings), @@ -340,7 +355,7 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optionalname_hint + "_cross_thread", /*body=*/ AttrStmt(/*node=*/reducer, @@ -376,21 +391,31 @@ Stmt TransformReductionBlock(const BlockRealizeNode* realize, const Optionalvar, new_iter_var->var); } } - BufferStore update = Downcast(block->body); - update = Downcast(Substitute(std::move(update), var_map)); + Array wb_updates; + Array wb_regions; + wb_updates.reserve(n_buffers); + wb_regions.reserve(n_buffers); + int n_dim = static_cast(old_wb_indices.size()); + Array region = Substitute(block->writes[0]->region, var_map); + Array wb_indices; + wb_indices.reserve(n_dim); + for (int d = 0; d < n_dim; ++d) { + wb_indices.push_back(Substitute(old_wb_indices[d], var_map)); + } + for (int i = 0; i < n_buffers; ++i) { + wb_updates.push_back( + BufferStore(wb_buffers[i], BufferLoad(ct_buffers[i], {Integer(0)}), 
wb_indices)); + wb_regions.push_back(BufferRegion(wb_buffers[i], region)); + } stmts.push_back(BlockRealize( /*iter_values=*/std::move(bindings), /*predicate=*/const_true(), /*block=*/ - Block( - /*iter_vars=*/std::move(iter_vars), - /*reads=*/{std::move(ct_buffer_region)}, - /*writes=*/{BufferRegion(wb_buffer, Substitute(wb_region, var_map))}, - /*name_hint=*/block->name_hint + "_write_back", - /*body=*/ - BufferStore(/*buffer=*/wb_buffer, - /*value=*/BufferLoad(ct_buffer, {Integer(0)}), - /*indices=*/update->indices)))); + Block(/*iter_vars=*/std::move(iter_vars), + /*reads=*/std::move(ct_buffer_regions), + /*writes=*/std::move(wb_regions), + /*name_hint=*/block->name_hint + "_write_back", + /*body=*/n_buffers > 1 ? SeqStmt(wb_updates) : wb_updates[0]))); } // Final step: Wrap all the above four statements with the reduction loops bound to threadIdx Stmt new_stmt = SeqStmt::Flatten(std::move(stmts)); @@ -447,18 +472,23 @@ class CrossThreadReductionTransformer : public StmtMutator { return need ? reduction_loops : std::vector{}; } - // Given that the input block needs cross-thread reduction, check if cross-thread reduction can - // be applied to the block (i.e., the block satisfies all necessary conditions of cross-thread - // reduction). - std::tuple CheckCanApplyCrossThreadReduction( - const BlockNode* block, const std::vector& reduction_loops) const { - // Condition 1. The block being applied cross-thread reduction should write to single buffer. - CHECK_EQ(block->writes.size(), 1) - << "ValueError: Cross-thread reduction requires the block to only " - "write to single buffer. However, the block " - << block->name_hint << " writes to " << block->writes.size() << " buffer(s)."; - - // Condition 2. All the reduction-related loops should be the deepest among all statements + /*! 
+ * \brief Given that the input block needs cross-thread reduction, check if cross-thread reduction + * can be applied to the block (i.e., the block satisfies all necessary conditions of cross-thread + * reduction) + * \param block The block to be checked + * \param reduction_loops The reduction loops above the block + * \return A tuple consisting of five elements: + * - an integer which indicates the number of reduction loops that are bound to thread axes, + * - the detected commutative reducer of the reduction, + * - the reduction buffers which store the reduction results, + * - the RHS values of the reduction updates, + * - the indices which is used to access the reduction buffers when storing the reduction results + */ + std::tuple, Array, Array> + CheckCanApplyCrossThreadReduction(const BlockNode* block, + const std::vector& reduction_loops) const { + // Condition 1. All the reduction-related loops should be the deepest among all statements // outside the block (ignoring SeqStmt here). int n_deepest_reduction_loops = 0; for (auto rit = statement_stack_.rbegin() + 1; rit != statement_stack_.rend(); ++rit) { @@ -480,7 +510,7 @@ class CrossThreadReductionTransformer : public StmtMutator { << " needs cross-thread reduction, while the reduction-related loops outside of it are not " "the deepest statements, which violates the condition."; - // Condition 3. All the reduction-related loops that are bound to thread axes should only be + // Condition 2. All the reduction-related loops that are bound to thread axes should only be // bound to `threadIdx.x/y/z`. int n_bound_reduction_loops = 0; for (const ForNode* reduction_loop : reduction_loops) { @@ -493,16 +523,26 @@ class CrossThreadReductionTransformer : public StmtMutator { } } - // Condition 4. Get the `init` identity and the `update` combiner of the reduction. 
They should - // both be BufferStores with the same buffer and indices; - // Extract the commutative reducer, combiner lhs and combiner rhs from the reduction identity - // and the reduction combiner. - auto [init, update] = GetBufferStoresFromReductionBlock(NullOpt, GetRef(block)); - auto [reducer, combiner_lhs, combiner_rhs] = - GetReducerAndCombinerLhsRhs(NullOpt, init->value, update); - (void)combiner_lhs; // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767 + // Condition 3. Get the identity values of the block init and the BufferStore block combiner + // updates of the reduction. Extract the commutative reducer, combiner lhs and combiner rhs from + // the reduction identities and the reduction combiner. + Array init_values{nullptr}; + Array updates{nullptr}; + CommReducer reducer{nullptr}; + Array combiner_lhs{nullptr}; + Array combiner_rhs{nullptr}; + std::tie(init_values, updates) = + GetInitValuesAndUpdatesFromReductionBlock(NullOpt, GetRef(block)); + std::tie(reducer, combiner_lhs, combiner_rhs) = + GetReducerAndCombinerLhsRhs(NullOpt, init_values, updates); + + Array reduction_buffers; + reduction_buffers.reserve(updates.size()); + for (const BufferStore& buf_store : updates) { + reduction_buffers.push_back(buf_store->buffer); + } - // Condition 5. The block should be the last block under the first reduction-related loop. + // Condition 4. The block should be the last block under the first reduction-related loop. 
bool visit = false; PreOrderVisit(GetRef(reduction_loops[0]), [block, &visit](const ObjectRef& obj) { if (const auto* realize = obj.as()) { @@ -515,7 +555,11 @@ class CrossThreadReductionTransformer : public StmtMutator { } return true; }); - return std::make_tuple(n_bound_reduction_loops, reducer, combiner_rhs); + return std::make_tuple(n_bound_reduction_loops, // + std::move(reducer), // + std::move(reduction_buffers), // + std::move(combiner_rhs), // + updates[0]->indices); } Stmt VisitStmt(const Stmt& stmt) final { @@ -570,10 +614,14 @@ class CrossThreadReductionTransformer : public StmtMutator { if (reduction_loops.empty()) { return StmtMutator::VisitStmt_(realize); } - ++reduction_id_; // Step 2. Check whether cross-thread reduction can be applied. If no, throw an exception on // which condition the block violates. - auto [n_bound_reduction_loops, reducer, combiner_rhs] = + int n_bound_reduction_loops = 0; + CommReducer reducer{nullptr}; + Array reduction_buffers{nullptr}; + Array combiner_rhs{nullptr}; + Array wb_indices{nullptr}; + std::tie(n_bound_reduction_loops, reducer, reduction_buffers, combiner_rhs, wb_indices) = CheckCanApplyCrossThreadReduction(block, reduction_loops); // Step 3. Before doing the cross-thread reduction, in-thread reduction is needed when // - not all the reduction-related loops are bound to thread axes, or @@ -581,31 +629,30 @@ class CrossThreadReductionTransformer : public StmtMutator { bool need_in_thread_reduction = n_bound_reduction_loops < static_cast(reduction_loops.size()) || !is_one(realize->predicate); - // Step 4. Create intermediate buffers, storing them in `ct_buffer` and - // `it_buffer`. Let the scope block allocate these new buffers. 
- std::vector& new_buffers = block2new_buffers_[block_stack_.back()]; - DataType dtype = block->writes[0]->buffer->dtype; - Buffer ct_buffer = MakeScratchpad("cross_thread_" + std::to_string(reduction_id_), dtype); - new_buffers.push_back(ct_buffer); - Optional it_buffer = NullOpt; + // Step 4. Create intermediate buffers, storing them in `ct_buffers` and + // `it_buffers`. Let the scope block allocate these new buffers. + Array& new_buffers = block2new_buffers_[block_stack_.back()]; + Array ct_buffers = MakeScratchpads(reduction_buffers, /*is_cross_thread_buffer=*/true); + new_buffers.insert(new_buffers.end(), ct_buffers.begin(), ct_buffers.end()); + Optional> it_buffers = NullOpt; if (need_in_thread_reduction) { - it_buffer = MakeScratchpad("in_thread_" + std::to_string(reduction_id_), dtype); - new_buffers.push_back(it_buffer.value()); + it_buffers = MakeScratchpads(reduction_buffers, /*is_cross_thread_buffer=*/false); + new_buffers.insert(new_buffers.end(), it_buffers.value().begin(), it_buffers.value().end()); } // Step 5. Transform. - loop2new_stmt_[reduction_loops[0]] = TransformReductionBlock( - realize, it_buffer, ct_buffer, reducer, combiner_rhs, reduction_loops); + loop2new_stmt_[reduction_loops[0]] = + TransformReductionBlock(realize, it_buffers, ct_buffers, reduction_buffers, wb_indices, + reducer, combiner_rhs, reduction_loops); // Step 6. Return an empty statement, because the transformation result will be inserted when // returning to the first reduction-related loop. 
return Stmt{nullptr}; } private: - int reduction_id_ = -1; std::vector statement_stack_; std::vector loop_stack_; std::vector block_stack_; - std::unordered_map> block2new_buffers_; + std::unordered_map> block2new_buffers_; std::unordered_map loop2new_stmt_; Map loop_range_map_; arith::Analyzer analyzer_; diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py index 17f42654fcf7..70b49944ba0f 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_add_rfactor.py @@ -119,5 +119,171 @@ def cpu_matmul_2( ) +def test_cpu_argmax(): + @T.prim_func + def argmax( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], + ) -> None: + for i0, i1 in T.grid(128, 128): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + @T.prim_func + def argmax_0( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[128, "int32"], + argmax_v1: T.Buffer[128, "float32"], + ) -> None: + for i0, i1 in T.grid(128, 128): + with T.block("argmax"): + i, k = T.axis.remap("SR", [i0, i1]) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = 
T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + @T.prim_func + def argmax_1( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[128, "int32"], + argmax_v1: T.Buffer[128, "float32"], + ) -> None: + argmax_v0_rf = T.alloc_buffer([128, 16], dtype="int32") + argmax_v1_rf = T.alloc_buffer([128, 16], dtype="float32") + for i0, i1_0, i1_1 in T.grid(128, 8, 16): + with T.block("argmax_rf"): + vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0]) + T.reads(idx[i, vi1_0 * 16 + vi1_1], val[i, vi1_0 * 16 + vi1_1]) + T.writes(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1]) + with T.init(): + argmax_v0_rf[i, vi1_1] = -1 + argmax_v1_rf[i, vi1_1] = T.float32(-3.4028234663852886e38) + v_argmax_v0_rf: T.int32 = T.Select( + argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 16 + vi1_1], + argmax_v0_rf[i, vi1_1], + idx[i, vi1_0 * 16 + vi1_1], + ) + v_argmax_v1_rf: T.float32 = T.Select( + argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 16 + vi1_1], + argmax_v1_rf[i, vi1_1], + val[i, vi1_0 * 16 + vi1_1], + ) + argmax_v0_rf[i, vi1_1] = v_argmax_v0_rf + argmax_v1_rf[i, vi1_1] = v_argmax_v1_rf + for i0, i1_1 in T.grid(128, 16): + with T.block("argmax"): + vi1_1, i = T.axis.remap("RS", [i1_1, i0]) + T.reads(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1]) + T.writes(argmax_v0[i], argmax_v1[i]) + T.block_attr({"meta_schedule.random_compute_producer": 1}) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v0[i], argmax_v0_rf[i, vi1_1] + ) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v1[i], argmax_v1_rf[i, vi1_1] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + @T.prim_func + def argmax_2( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[128, "int32"], + 
argmax_v1: T.Buffer[128, "float32"], + ) -> None: + # body + # with T.block("root") + argmax_v0_rf = T.alloc_buffer([128, 8], dtype="int32") + argmax_v1_rf = T.alloc_buffer([128, 8], dtype="float32") + for i0, i1_0, i1_1 in T.grid(128, 8, 16): + with T.block("argmax_rf"): + vi1_0, i, vi1_1 = T.axis.remap("SSR", [i1_0, i0, i1_1]) + T.reads(idx[i, vi1_0 * 16 + vi1_1], val[i, vi1_0 * 16 + vi1_1]) + T.writes(argmax_v0_rf[i, vi1_0], argmax_v1_rf[i, vi1_0]) + with T.init(): + argmax_v0_rf[i, vi1_0] = -1 + argmax_v1_rf[i, vi1_0] = T.float32(-3.4028234663852886e38) + v_argmax_v0_rf: T.int32 = T.Select( + argmax_v1_rf[i, vi1_0] >= val[i, vi1_0 * 16 + vi1_1], + argmax_v0_rf[i, vi1_0], + idx[i, vi1_0 * 16 + vi1_1], + ) + v_argmax_v1_rf: T.float32 = T.Select( + argmax_v1_rf[i, vi1_0] >= val[i, vi1_0 * 16 + vi1_1], + argmax_v1_rf[i, vi1_0], + val[i, vi1_0 * 16 + vi1_1], + ) + argmax_v0_rf[i, vi1_0] = v_argmax_v0_rf + argmax_v1_rf[i, vi1_0] = v_argmax_v1_rf + for i0, i1_0 in T.grid(128, 8): + with T.block("argmax"): + vi1_0, i = T.axis.remap("RS", [i1_0, i0]) + T.reads(argmax_v0_rf[i, vi1_0], argmax_v1_rf[i, vi1_0]) + T.writes(argmax_v0[i], argmax_v1[i]) + T.block_attr({"meta_schedule.random_compute_producer": 1}) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_0], argmax_v0[i], argmax_v0_rf[i, vi1_0] + ) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_0], argmax_v1[i], argmax_v1_rf[i, vi1_0] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + decision_0 = [] # type: ignore + decision_1 = [ + ("SamplePerfectTile", [8, 16]), + ] + decision_2 = [ + ("SamplePerfectTile", [8, 16]), + ] + mod = argmax + actual = ms.TuneContext( + mod=mod, + target=Target("llvm --num-cores=32"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ms.schedule_rule.AddRFactor()], + task_name="test", + ).generate_design_space() + 
check_sketches( + mod, + sketches=actual, + expected_mods=[argmax_0, argmax_1, argmax_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + if __name__ == "__main__": test_cpu_matmul() + test_cpu_argmax() diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py index a0ca47c09a34..ab8df6678b0b 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py @@ -572,7 +572,106 @@ def batch_norm_bmn_1(A: T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "floa ) +@T.prim_func +def argmax( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1 in T.grid(128, 128): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +def test_gpu_argmax(): + @T.prim_func + def argmax_0( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[128, "int32"], + argmax_v1: T.Buffer[128, "float32"], + ) -> None: + # body + # with T.block("root") + for i0, i1 in T.grid(128, 128): + with T.block("argmax"): + i, k = T.axis.remap("SR", [i0, i1]) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], 
argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + @T.prim_func + def argmax_1( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[128, "int32"], + argmax_v1: T.Buffer[128, "float32"], + ) -> None: + # body + # with T.block("root") + for i0, i1_0 in T.grid(128, 2): + for i1_1 in T.thread_binding(64, thread="threadIdx.x"): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 64 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k] + ) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + decision_0 = [] # type: ignore + decision_1 = [ + ("SampleCategorical", 4), + ] + + mod = argmax + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3090", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) + ], + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[argmax_0, argmax_1], + expected_decisions=[decision_0, decision_1], + ) + + if __name__ == "__main__": test_gpu_softmax_mn() test_gpu_softmax_mn_after_inline() test_gpu_batch_norm_bmn() + test_gpu_argmax() diff --git a/tests/python/unittest/test_tir_schedule_rfactor.py b/tests/python/unittest/test_tir_schedule_rfactor.py index 4078b1e89682..f6db79f3ed23 100644 --- a/tests/python/unittest/test_tir_schedule_rfactor.py +++ b/tests/python/unittest/test_tir_schedule_rfactor.py @@ -29,9 +29,9 @@ @T.prim_func def 
transformed_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, [128, 128]) - B = T.match_buffer(b, [128, 128]) - C = T.match_buffer(c, [128, 128]) + A = T.match_buffer(a, [128, 128], dtype="float32") + B = T.match_buffer(b, [128, 128], dtype="float32") + C = T.match_buffer(c, [128, 128], dtype="float32") for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4): with T.block("update"): @@ -44,12 +44,30 @@ def transformed_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk]) +@T.prim_func +def transformed_matmul_with_let(a: T.handle, b: T.handle, c: T.handle) -> None: + A = T.match_buffer(a, [128, 128], dtype="float32") + B = T.match_buffer(b, [128, 128], dtype="float32") + C = T.match_buffer(c, [128, 128], dtype="float32") + + for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4): + with T.block("update"): + vi, vj = T.axis.remap("SS", [i0, i1]) + vk = T.axis.R(128, i2_outer * 32 + i2_inner_outer * 4 + i2_inner_inner) + T.reads([A[vi, vk], B[vj, vk]]) + T.writes([C[vi, vj]]) + with T.init(): + C[vi, vj] = 0.0 + v_C: T.float32 = C[vi, vj] + (A[vi, vk] * B[vj, vk]) + C[vi, vj] = v_C + + @T.prim_func def matmul_rfactor(a: T.handle, b: T.handle, c: T.handle) -> None: - A = T.match_buffer(a, [128, 128]) - B = T.match_buffer(b, [128, 128]) - C = T.match_buffer(c, [128, 128]) - C_rf = T.alloc_buffer([4, 128, 128]) + A = T.match_buffer(a, [128, 128], dtype="float32") + B = T.match_buffer(b, [128, 128], dtype="float32") + C = T.match_buffer(c, [128, 128], dtype="float32") + C_rf = T.alloc_buffer([4, 128, 128], dtype="float32") for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid(128, 128, 4, 8, 4): with T.block("update_rf"): @@ -436,6 +454,20 @@ def rowsum_wrong_reduce_pattern2(a: T.handle, b: T.handle) -> None: B[vi] = B[vi] - A[vi, vk] +@T.prim_func +def rowsum_init_not_bufferstore(a: T.handle, b: T.handle) -> None: + A = 
T.match_buffer(a, (128, 128)) + B = T.match_buffer(b, (128,)) + + for i, k in T.grid(128, 128): + with T.block("B"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + v_init: T.float32 = T.float32(0) + B[vi] = v_init + B[vi] = B[vi] + A[vi, vk] + + @T.prim_func def rowsum_transformed(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) @@ -654,6 +686,453 @@ def rfactor_spatial_only_after( B[ax0, ax1, ax2, ax3] = B[ax0, ax1, ax2, ax3] + B_rf[ax0, ax1, ax2, ax3, vi4] +@T.prim_func +def argmax_split( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmin_split_init_update_reordered( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmin_v0: T.Buffer[(128,), "int32"], + argmin_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmin"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmin_v0[i], argmin_v1[i]) + with T.init(): + argmin_v1[i] = T.max_value("float32") + argmin_v0[i] = -1 + v_argmin_v0: T.int32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v0[i], idx[i, k]) + v_argmin_v1: T.float32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v1[i], val[i, k]) + argmin_v1[i] = v_argmin_v1 + argmin_v0[i] = v_argmin_v0 + + +@T.prim_func +def 
argmax_split_different_shape( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(256,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_different_indices( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i + 1] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i + 1] = v_argmax_v1 + + +@T.prim_func +def argmax_split_init_not_bufferstore( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + v1_init: T.float32 = 
T.min_value("float32") + argmax_v1[i] = v1_init + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_init_buffer_duplicate( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v0[i] = -1 + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_letstmt_fewer_than_init( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + + +@T.prim_func +def argmax_split_letstmt_more_than_init( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 
4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_let_body_neither_seqstmt_nor_bufferstore( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + T.evaluate(0) + + +@T.prim_func +def argmax_split_init_update_inconsistent_bufferstore_number( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + argmax_v1[i] = v_argmax_v1 + + 
+@T.prim_func +def argmax_split_body_seq_not_bufferstore( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + T.evaluate(0) + + +@T.prim_func +def argmax_split_body_bufferstore_value_not_var( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_body_bufferstore_value_unbound_var( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + v_unbound = T.var("int32") + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) 
+ T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_unbound + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_one_let_var_used_multi_times( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "int32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "int32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("int32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v0 + + +@T.prim_func +def argmax_split_body_one_buffer_updated_multi_times( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "int32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "int32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("int32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v0[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_init_buffer_not_match( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + 
argmax_v0: T.Buffer[(128,), "int32"], + argmax_v0_1: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v0_1[i], argmax_v1[i]) + with T.init(): + argmax_v0_1[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmax_split_rfactor( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + argmax_v0_rf = T.alloc_buffer([128, 32], dtype="int32") + argmax_v1_rf = T.alloc_buffer([128, 32], dtype="float32") + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmax_rf"): + vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0]) + T.reads(idx[i, vi1_0 * 32 + vi1_1], val[i, vi1_0 * 32 + vi1_1]) + T.writes(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1]) + with T.init(): + argmax_v0_rf[i, vi1_1] = -1 + argmax_v1_rf[i, vi1_1] = T.min_value("float32") + v_argmax_v0_rf: T.int32 = T.Select( + argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 32 + vi1_1], + argmax_v0_rf[i, vi1_1], + idx[i, vi1_0 * 32 + vi1_1], + ) + v_argmax_v1_rf: T.float32 = T.Select( + argmax_v1_rf[i, vi1_1] >= val[i, vi1_0 * 32 + vi1_1], + argmax_v1_rf[i, vi1_1], + val[i, vi1_0 * 32 + vi1_1], + ) + argmax_v0_rf[i, vi1_1] = v_argmax_v0_rf + argmax_v1_rf[i, vi1_1] = v_argmax_v1_rf + for i0, i1_1 in T.grid(128, 32): + with T.block("argmax"): + vi1_1, i = T.axis.remap("RS", [i1_1, i0]) + T.reads(argmax_v0_rf[i, vi1_1], argmax_v1_rf[i, vi1_1]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + 
argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v0[i], argmax_v0_rf[i, vi1_1] + ) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= argmax_v1_rf[i, vi1_1], argmax_v1[i], argmax_v1_rf[i, vi1_1] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def argmin_split_rfactor( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmin_v0: T.Buffer[(128,), "int32"], + argmin_v1: T.Buffer[(128,), "float32"], +) -> None: + argmin_v0_rf = T.alloc_buffer([128, 32], dtype="int32") + argmin_v1_rf = T.alloc_buffer([128, 32], dtype="float32") + for i0, i1_0, i1_1 in T.grid(128, 4, 32): + with T.block("argmin_rf"): + vi1_1, i, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0]) + T.reads(idx[i, vi1_0 * 32 + vi1_1], val[i, vi1_0 * 32 + vi1_1]) + T.writes(argmin_v0_rf[i, vi1_1], argmin_v1_rf[i, vi1_1]) + with T.init(): + argmin_v0_rf[i, vi1_1] = -1 + argmin_v1_rf[i, vi1_1] = T.max_value("float32") + v_argmin_v0_rf: T.int32 = T.Select( + argmin_v1_rf[i, vi1_1] <= val[i, vi1_0 * 32 + vi1_1], + argmin_v0_rf[i, vi1_1], + idx[i, vi1_0 * 32 + vi1_1], + ) + v_argmin_v1_rf: T.float32 = T.Select( + argmin_v1_rf[i, vi1_1] <= val[i, vi1_0 * 32 + vi1_1], + argmin_v1_rf[i, vi1_1], + val[i, vi1_0 * 32 + vi1_1], + ) + argmin_v0_rf[i, vi1_1] = v_argmin_v0_rf + argmin_v1_rf[i, vi1_1] = v_argmin_v1_rf + for i0, i1_1 in T.grid(128, 32): + with T.block("argmin"): + vi1_1, i = T.axis.remap("RS", [i1_1, i0]) + T.reads(argmin_v0_rf[i, vi1_1], argmin_v1_rf[i, vi1_1]) + T.writes(argmin_v0[i], argmin_v1[i]) + with T.init(): + argmin_v0[i] = -1 + argmin_v1[i] = T.max_value("float32") + v_argmin_v0: T.int32 = T.Select( + argmin_v1[i] <= argmin_v1_rf[i, vi1_1], argmin_v0[i], argmin_v0_rf[i, vi1_1] + ) + v_argmin_v1: T.float32 = T.Select( + argmin_v1[i] <= argmin_v1_rf[i, vi1_1], argmin_v1[i], argmin_v1_rf[i, vi1_1] + ) + argmin_v0[i] = v_argmin_v0 + argmin_v1[i] = v_argmin_v1 + + # 
pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg @@ -668,6 +1147,17 @@ def test_reduction_rfactor_matmul(): verify_trace_roundtrip(s, mod=transformed_matmul) +def test_reduction_rfactor_matmul_with_let(): + s = tir.Schedule(transformed_matmul_with_let, debug_mask="all") + update = s.get_block("update") + _, _, _, _, kii = s.get_loops(update) + rf_block = s.rfactor(kii, 0) + tvm.ir.assert_structural_equal(s.mod["main"], matmul_rfactor) + assert s.get(rf_block).same_as(s.get(s.get_block("update_rf"))) + assert s.get(update).same_as(s.get(s.get_block("update"))) + verify_trace_roundtrip(s, mod=transformed_matmul_with_let) + + def test_reduction_rfactor_square_sum(): s = tir.Schedule(square_sum, debug_mask="all") C = s.get_block("C") @@ -773,6 +1263,13 @@ def test_reduction_rfactor_wrong_reduce_pattern2(): s.rfactor(k, 0) +def test_reduction_rfactor_init_not_bufferstore(): + s = tir.Schedule(rowsum_init_not_bufferstore, debug_mask="all") + _, k = s.get_loops(s.get_block("B")) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(k, 0) + + def test_reduction_rfactor_wrong_loops1(): s = tir.Schedule(rowsum, debug_mask="all") i, _ = s.get_loops(s.get_block("B")) @@ -852,10 +1349,146 @@ def test_reduction_rfactor_spatial_only(): s = tir.Schedule(rfactor_spatial_only, debug_mask="all") block = s.get_block(name="acc", func_name="main") _, _, _, _, loop, _ = s.get_loops(block) - s.rfactor(loop=loop, factor_axis=4) + rf_block = s.rfactor(loop=loop, factor_axis=4) tvm.ir.assert_structural_equal(s.mod["main"], rfactor_spatial_only_after) + assert s.get(rf_block).same_as(s.get(s.get_block("acc_rf"))) + assert s.get(block).same_as(s.get(s.get_block("acc"))) verify_trace_roundtrip(s, mod=rfactor_spatial_only) +def test_reduction_rfactor_argmax(): + s = tir.Schedule(argmax_split, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + rf_block = s.rfactor(ki, 1) + tvm.ir.assert_structural_equal(s.mod["main"], 
argmax_split_rfactor) + assert s.get(rf_block).same_as(s.get(s.get_block("argmax_rf"))) + assert s.get(argmax).same_as(s.get(s.get_block("argmax"))) + verify_trace_roundtrip(s, mod=argmax_split) + + +def test_reduction_rfactor_argmin_init_update_reordeded(): + s = tir.Schedule(argmin_split_init_update_reordered, debug_mask="all") + argmin = s.get_block("argmin") + _, _, ki = s.get_loops(argmin) + rf_block = s.rfactor(ki, 1) + tvm.ir.assert_structural_equal(s.mod["main"], argmin_split_rfactor) + assert s.get(rf_block).same_as(s.get(s.get_block("argmin_rf"))) + assert s.get(argmin).same_as(s.get(s.get_block("argmin"))) + verify_trace_roundtrip(s, mod=argmin_split_init_update_reordered) + + +def test_reduction_rfactor_argmax_reduction_buffer_different_shape(): + s = tir.Schedule(argmax_split_different_shape, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_different_access_indices(): + s = tir.Schedule(argmax_split_different_indices, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_init_not_bufferstore(): + s = tir.Schedule(argmax_split_init_not_bufferstore, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_init_buffer_duplicate(): + s = tir.Schedule(argmax_split_init_buffer_duplicate, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_letstmt_fewer_than_init(): + s = tir.Schedule(argmax_split_letstmt_fewer_than_init, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with 
pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_letstmt_more_than_init(): + s = tir.Schedule(argmax_split_letstmt_more_than_init, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_let_body_neither_seqstmt_nor_bufferstore(): + s = tir.Schedule(argmax_split_let_body_neither_seqstmt_nor_bufferstore, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_init_update_inconsistent_bufferstore_number(): + s = tir.Schedule(argmax_split_init_update_inconsistent_bufferstore_number, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_body_seq_not_bufferstore(): + s = tir.Schedule(argmax_split_body_seq_not_bufferstore, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_body_bufferstore_value_not_var(): + s = tir.Schedule(argmax_split_body_bufferstore_value_not_var, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_body_bufferstore_value_unbound_var(): + s = tir.Schedule(argmax_split_body_bufferstore_value_unbound_var, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_one_let_var_used_multi_times(): + s = tir.Schedule(argmax_split_one_let_var_used_multi_times, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = 
s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_body_one_buffer_updated_multi_times(): + s = tir.Schedule(argmax_split_body_one_buffer_updated_multi_times, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + +def test_reduction_rfactor_argmax_init_buffer_not_match(): + s = tir.Schedule(argmax_split_init_buffer_not_match, debug_mask="all") + argmax = s.get_block("argmax") + _, _, ki = s.get_loops(argmax) + with pytest.raises(tvm.tir.ScheduleError): + s.rfactor(ki, 1) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py index 9b5937ac6efd..ff1353d2265e 100644 --- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py +++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=missing-function-docstring,missing-module-docstring import sys import pytest @@ -22,6 +23,8 @@ from tvm import te from tvm.script import tir as T +# pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg + def _check(original, transformed): mod = tvm.IRModule.from_expr(original) @@ -44,7 +47,7 @@ def loop_split(a: T.handle, b: T.handle) -> None: with T.block("B"): vi = T.axis.S(128, i) vk = T.axis.R(128, ko * 32 + ki) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -67,7 +70,7 @@ def lowered_loop_split(a: T.handle, b: T.handle) -> None: with T.block("B_normal_reduction"): vi = T.axis.S(128, i) vk = T.axis.R(128, ko * 32 + ki) - T.reads([A[vi, vk], normal_reduce_temp0[0]]) + T.reads([A[vi, vk]]) T.writes([normal_reduce_temp0[0]]) normal_reduce_temp0[0] = normal_reduce_temp0[0] + A[vi, vk] with T.block("B_cross_thread_reduction"): @@ -103,7 +106,7 @@ def no_normal_reduction(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block("B"): vi, vk = T.axis.remap("SR", [i, k]) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -148,7 +151,7 @@ def two_bound_loops(a: T.handle, b: T.handle) -> None: with T.block("B"): vi = T.axis.spatial(128, i) vk = T.axis.reduce(128, ko * 32 + ki) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -196,7 +199,7 @@ def multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> None: with T.block("B_rf"): vk0 = T.axis.spatial(16, k0o * 4 + k0i0) vi, vk1 = T.axis.remap("SR", [i, k1]) - T.reads([B_rf_local[vk0, vi], A[vi, vk0, vk1]]) + T.reads([A[vi, vk0, vk1]]) T.writes([B_rf_local[vk0, vi]]) with T.init(): B_rf_local[vk0, vi] = T.float32(0) @@ -205,7 +208,7 @@ def multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> None: with T.block("B"): vk0 = 
T.axis.reduce(16, k0o * 4 + k0i1) vi = T.axis.spatial(16, i) - T.reads([B[vi], B_rf_local[vk0, vi]]) + T.reads([B_rf_local[vk0, vi]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -229,7 +232,7 @@ def lowered_multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> No with T.block("B_rf"): vk0 = T.axis.spatial(16, k0o * 4 + k0i0) vi, vk1 = T.axis.remap("SR", [i, k1]) - T.reads([B_rf_local[vk0, vi], A[vi, vk0, vk1]]) + T.reads([A[vi, vk0, vk1]]) T.writes([B_rf_local[vk0, vi]]) with T.init(): B_rf_local[vk0, vi] = T.float32(0) @@ -238,7 +241,7 @@ def lowered_multiple_blocks_under_reduction_loop(a: T.handle, b: T.handle) -> No with T.block("B_normal_reduction"): vk0 = T.axis.reduce(16, k0o * 4 + k0i1) vi = T.axis.spatial(16, i) - T.reads([B_rf_local[vk0, vi], normal_reduce_temp0[0]]) + T.reads([B_rf_local[vk0, vi]]) T.writes([normal_reduce_temp0[0]]) normal_reduce_temp0[0] = normal_reduce_temp0[0] + B_rf_local[vk0, vi] with T.block("B_cross_thread_reduction"): @@ -276,7 +279,7 @@ def with_block_predicate(a: T.handle, b: T.handle) -> None: vi = T.axis.spatial(128, i) vk = T.axis.reduce(120, ko * 32 + ki) T.where(ko * 32 + ki < 120) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -300,7 +303,7 @@ def lowered_with_block_predicate(a: T.handle, b: T.handle) -> None: vi = T.axis.spatial(128, i) vk = T.axis.reduce(120, ko * 32 + ki) T.where(ko * 32 + ki < 120) - T.reads([A[vi, vk], normal_reduce_temp0[0]]) + T.reads([A[vi, vk]]) T.writes([normal_reduce_temp0[0]]) normal_reduce_temp0[0] = normal_reduce_temp0[0] + A[vi, vk] with T.block("B_cross_thread_reduction"): @@ -341,7 +344,7 @@ def single_reduction_loop_with_block_predicate( i0_1 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax1_1) T.where(ax1_0 * 512 + ax1_1 < 256) - T.reads(T_softmax_maxelem_shared[i0_1], A[i0_1, k]) + T.reads(A[i0_1, k]) T.writes(T_softmax_maxelem_shared[i0_1]) with T.init(): T_softmax_maxelem_shared[i0_1] = 
T.float32(-3.4028234663852886e38) @@ -354,9 +357,7 @@ def single_reduction_loop_with_block_predicate( i0_2 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax1_1) T.where(ax1_0 * 512 + ax1_1 < 256) - T.reads( - T_softmax_expsum_shared[i0_2], A[i0_2, k], T_softmax_maxelem_shared[i0_2] - ) + T.reads(A[i0_2, k], T_softmax_maxelem_shared[i0_2]) T.writes(T_softmax_expsum_shared[i0_2]) with T.init(): T_softmax_expsum_shared[i0_2] = T.float32(0) @@ -401,7 +402,7 @@ def lowered_single_reduction_loop_with_block_predicate( i0_1 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax1_1) T.where(ax1_0 * 512 + ax1_1 < 256) - T.reads(A[i0_1, k], in_thread_0[0]) + T.reads(A[i0_1, k]) T.writes(in_thread_0[0]) in_thread_0[0] = T.max(in_thread_0[0], A[i0_1, k]) with T.block("T_softmax_maxelem_cross_thread"): @@ -439,7 +440,7 @@ def lowered_single_reduction_loop_with_block_predicate( i0_3 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax1_1) T.where(ax1_0 * 512 + ax1_1 < 256) - T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3], in_thread_1[0]) + T.reads(A[i0_3, k], T_softmax_maxelem_shared[i0_3]) T.writes(in_thread_1[0]) in_thread_1[0] = in_thread_1[0] + T.exp( A[i0_3, k] - T_softmax_maxelem_shared[i0_3], dtype="float32" @@ -492,7 +493,7 @@ def reducer_max(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block("B"): vi, vk = T.axis.remap("SR", [i, k]) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.min_value("float32") @@ -534,7 +535,7 @@ def zero_rank_buffer(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block("B"): vk = T.axis.reduce(128, k) - T.reads([B[()], A[vk]]) + T.reads([A[vk]]) T.writes([B[()]]) with T.init(): B[()] = T.float32(0) @@ -590,7 +591,7 @@ def reduction_loop_not_deepest(a: T.handle, b: T.handle) -> None: for i in T.serial(0, 128): with T.block("B"): vi, vk = T.axis.remap("SR", [i, k]) - T.reads([B[vi], A[vi, vk]]) + 
T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -605,7 +606,7 @@ def reduction_loop_bound_to_blockidx(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="blockIdx.x"): with T.block("B"): vi, vk = T.axis.remap("SR", [i, k]) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -620,7 +621,7 @@ def different_access_indices(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block("B"): vi, vj, vk = T.axis.remap("SSR", [i, j, k]) - T.reads([B[vi, vj], A[vi, vj, vk]]) + T.reads([A[vi, vj, vk]]) T.writes( [ B[ @@ -642,7 +643,7 @@ def invalid_reducer(a: T.handle, b: T.handle) -> None: for k in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block("B"): vi, vk = T.axis.remap("SR", [i, k]) - T.reads([B[vi], A[vi, vk]]) + T.reads([A[vi, vk]]) T.writes([B[vi]]) with T.init(): B[vi] = T.float32(0) @@ -661,7 +662,7 @@ def softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None: with T.block("T_softmax_maxelem"): i0_1 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax0_0 * 32 + ax0_1) - T.reads([T_softmax_maxelem_shared[i0_1], A[i0_1, k]]) + T.reads([A[i0_1, k]]) T.writes([T_softmax_maxelem_shared[i0_1]]) with T.init(): T_softmax_maxelem_shared[i0_1] = T.min_value("float32") @@ -675,7 +676,6 @@ def softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None: k = T.axis.reduce(256, ax0_0 * 32 + ax0_1) T.reads( [ - T_softmax_expsum_shared[i0_2], A[i0_2, k], T_softmax_maxelem_shared[i0_2], ] @@ -729,7 +729,7 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None: with T.block("T_softmax_maxelem_normal_reduction"): i0_1 = T.axis.spatial(256, i0) k = T.axis.reduce(256, ax0_0 * 32 + ax0_1) - T.reads([A[i0_1, k], normal_reduce_temp0[0]]) + T.reads([A[i0_1, k]]) T.writes([normal_reduce_temp0[0]]) normal_reduce_temp0[0] = T.max(normal_reduce_temp0[0], A[i0_1, k]) with 
T.block("T_softmax_maxelem_cross_thread_reduction"): @@ -768,7 +768,6 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None: [ A[i0_3, k], T_softmax_maxelem_shared[i0_3], - normal_reduce_temp1[0], ] ) T.writes([normal_reduce_temp1[0]]) @@ -821,6 +820,191 @@ def lowered_softmax(var_A: T.handle, var_T_softmax_norm: T.handle) -> None: ) +@T.prim_func +def argmax_split( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0 in T.grid(128, 4): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("argmax"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + +@T.prim_func +def lowered_argmax_split( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmax_v0: T.Buffer[(128,), "int32"], + argmax_v1: T.Buffer[(128,), "float32"], +) -> None: + cross_thread_argmax_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local") + cross_thread_argmax_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + in_thread_argmax_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local") + in_thread_argmax_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + for i0 in T.serial(128): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("argmax_in_thread_init"): + T.reads() + T.writes(in_thread_argmax_v0[0], in_thread_argmax_v1[0]) + in_thread_argmax_v0[0] = -1 + in_thread_argmax_v1[0] = 
T.float32(-3.4028234663852886e38) + for i1_0 in T.serial(4): + with T.block("argmax_in_thread"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(in_thread_argmax_v0[0], in_thread_argmax_v1[0]) + v_argmax_v0: T.int32 = T.Select( + in_thread_argmax_v1[0] >= val[i, k], in_thread_argmax_v0[0], idx[i, k] + ) + v_argmax_v1: T.float32 = T.Select( + in_thread_argmax_v1[0] >= val[i, k], in_thread_argmax_v1[0], val[i, k] + ) + in_thread_argmax_v0[0] = v_argmax_v0 + in_thread_argmax_v1[0] = v_argmax_v1 + with T.block("argmax_cross_thread"): + T.reads(in_thread_argmax_v0[0], in_thread_argmax_v1[0]) + T.writes(cross_thread_argmax_v0[0], cross_thread_argmax_v1[0]) + T.attr( + T.comm_reducer( + lambda x0, x1, y0, y1: ( + T.Select(x1 >= y1, x0, y0), + T.Select(x1 >= y1, x1, y1), + ), + [-1, T.float32(-3.4028234663852886e38)], + ), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), + ) + T.evaluate( + T.tvm_thread_allreduce( + T.uint32(2), + in_thread_argmax_v0[0], + in_thread_argmax_v1[0], + True, + cross_thread_argmax_v0[0], + cross_thread_argmax_v1[0], + i1_1, + dtype="handle", + ) + ) + with T.block("argmax_write_back"): + i = T.axis.spatial(128, i0) + T.reads(cross_thread_argmax_v0[0], cross_thread_argmax_v1[0]) + T.writes(argmax_v0[i], argmax_v1[i]) + argmax_v0[i] = cross_thread_argmax_v0[0] + argmax_v1[i] = cross_thread_argmax_v1[0] + + +@T.prim_func +def argmin_split_init_update_reordered( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmin_v0: T.Buffer[(128,), "int32"], + argmin_v1: T.Buffer[(128,), "float32"], +) -> None: + for i0, i1_0 in T.grid(128, 4): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("argmin"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmin_v0[i], argmin_v1[i]) + with T.init(): + argmin_v1[i] = T.float32(3.4028234663852886e38) + 
argmin_v0[i] = -1 + v_argmin_v0: T.int32 = T.Select(argmin_v1[i] <= val[i, k], argmin_v0[i], idx[i, k]) + v_argmin_v1: T.float32 = T.Select( + argmin_v1[i] <= val[i, k], argmin_v1[i], val[i, k] + ) + argmin_v1[i] = v_argmin_v1 + argmin_v0[i] = v_argmin_v0 + + +@T.prim_func +def lowered_argmin_split_init_update_reordered( + idx: T.Buffer[(128, 128), "int32"], + val: T.Buffer[(128, 128), "float32"], + argmin_v0: T.Buffer[(128,), "int32"], + argmin_v1: T.Buffer[(128,), "float32"], +) -> None: + cross_thread_argmin_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local") + cross_thread_argmin_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + in_thread_argmin_v0 = T.alloc_buffer([1], dtype="int32", strides=[1], scope="local") + in_thread_argmin_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + for i0 in T.serial(128): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("argmin_in_thread_init"): + T.reads() + T.writes(in_thread_argmin_v0[0], in_thread_argmin_v1[0]) + in_thread_argmin_v0[0] = -1 + in_thread_argmin_v1[0] = T.float32(3.4028234663852886e38) + for i1_0 in T.serial(4): + with T.block("argmin_in_thread"): + i = T.axis.spatial(128, i0) + k = T.axis.reduce(128, i1_0 * 32 + i1_1) + T.reads(idx[i, k], val[i, k]) + T.writes(in_thread_argmin_v0[0], in_thread_argmin_v1[0]) + v_argmin_v0: T.int32 = T.Select( + in_thread_argmin_v1[0] <= val[i, k], in_thread_argmin_v0[0], idx[i, k] + ) + v_argmin_v1: T.float32 = T.Select( + in_thread_argmin_v1[0] <= val[i, k], in_thread_argmin_v1[0], val[i, k] + ) + in_thread_argmin_v1[0] = v_argmin_v1 + in_thread_argmin_v0[0] = v_argmin_v0 + with T.block("argmin_cross_thread"): + T.reads(in_thread_argmin_v0[0], in_thread_argmin_v1[0]) + T.writes(cross_thread_argmin_v0[0], cross_thread_argmin_v1[0]) + T.attr( + T.comm_reducer( + lambda x0, x1, y0, y1: ( + T.Select(x1 <= y1, x0, y0), + T.Select(x1 <= y1, x1, y1), + ), + [-1, T.float32(3.4028234663852886e38)], + ), 
+ "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), + ) + T.evaluate( + T.tvm_thread_allreduce( + T.uint32(2), + in_thread_argmin_v0[0], + in_thread_argmin_v1[0], + True, + cross_thread_argmin_v0[0], + cross_thread_argmin_v1[0], + i1_1, + dtype="handle", + ) + ) + with T.block("argmin_write_back"): + i = T.axis.spatial(128, i0) + T.reads(cross_thread_argmin_v0[0], cross_thread_argmin_v1[0]) + T.writes(argmin_v0[i], argmin_v1[i]) + argmin_v0[i] = cross_thread_argmin_v0[0] + argmin_v1[i] = cross_thread_argmin_v1[0] + + +# pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg + + def test_loop_split(): _check(loop_split, lowered_loop_split) @@ -880,6 +1064,14 @@ def test_softmax(): _check(softmax, lowered_softmax) +def test_argmax_split(): + _check(argmax_split, lowered_argmax_split) + + +def test_argmin_split_init_update_reordered(): + _check(argmin_split_init_update_reordered, lowered_argmin_split_init_update_reordered) + + def test_lower_te(): a = te.placeholder((32, 2, 2)) k1 = te.reduce_axis((0, 2), "k1") From 296565aaf985adbc33ede565e9b167987138ddfc Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Wed, 14 Sep 2022 23:06:05 +0100 Subject: [PATCH 167/704] Fixed pylint issues after moving to venv in ci_lint docker (#12775) Following change introduced installing python dependencies inside virtual environments: https://github.com/apache/tvm/pull/12663 Previous to this fix, a different version of python was being picked up that didn't catch the issues fixed in this commit. 
Change-Id: Ie290d9474a799311e07d293fa1b8299326b11661 --- python/tvm/relay/testing/darknet.py | 2 +- tests/python/frontend/darknet/test_forward.py | 2 +- tests/python/frontend/tensorflow/test_forward.py | 2 +- tests/python/frontend/tflite/test_forward.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py index e1345043c6bb..b1f364273e1b 100644 --- a/python/tvm/relay/testing/darknet.py +++ b/python/tvm/relay/testing/darknet.py @@ -23,9 +23,9 @@ These are utility functions used for testing and tutorial file. """ from __future__ import division +from cffi import FFI import numpy as np import cv2 -from cffi import FFI def convert_image(image): diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index ffaa773fc1bd..5e6af51f3298 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -22,6 +22,7 @@ All the required models and libraries will be downloaded from the internet by the script. 
""" +from cffi import FFI import numpy as np import tvm from tvm.contrib import graph_executor @@ -31,7 +32,6 @@ from tvm.relay.testing.darknet import __darknetffi__ from tvm.relay.frontend.darknet import ACTIVATION from tvm import relay -from cffi import FFI REPO_URL = "https://github.com/dmlc/web-data/blob/main/darknet/" DARKNET_LIB = "libdarknet2.0.so" diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 8ed6d9108e5d..f3195f05d40f 100755 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -26,11 +26,11 @@ import threading import platform import os.path +from packaging import version as package_version import numpy as np import pytest from PIL import Image -from packaging import version as package_version from tvm import relay from tvm.runtime.vm import VirtualMachine from tvm.relay.frontend.tensorflow import from_tensorflow diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 18045b8e8365..deaef72e1d7f 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -26,11 +26,11 @@ import os import tempfile +from packaging import version as package_version import pytest import numpy as np from PIL import Image -from packaging import version as package_version import tvm import tvm.relay.testing.tf as tf_testing From e5adb83d8e1cd3f5a9fe10946fb7b5b60bf54b94 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 14 Sep 2022 20:08:32 -0300 Subject: [PATCH 168/704] [microTVM][Zephyr] Fix PLL freq. in overlay for nucleo_l4r5zi board (#12756) * [microTVM][Zephyr] Fix PLL freq. 
in overlay for nucleo_l4r5zi board Commit 1d32c400f ("Add project overlay to overwrite device tree configs") added overlay for setting 'clock-frequency' property of node 'rcc' to 120 MHz; however, to effectively change the PLL frequency that drives the core, it is also necessary to overlay the attributes for the 'pll' node. This commit does that. Signed-off-by: Gustavo Romero * Remove div-p and div-q properties from overlay Remove div-p and div-q properties from the overlay file since values for these properties will be inherited from the 'pll' that is overlaid. Since currently microTVM does not use any subsystem which relies on clocks associated to either P or Q params, these params can be left unchanged for now. Signed-off-by: Gustavo Romero --- .../app-overlay/nucleo_l4r5zi.overlay | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay index 360e0753d4f5..532efe50d397 100644 --- a/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay +++ b/apps/microtvm/zephyr/template_project/app-overlay/nucleo_l4r5zi.overlay @@ -21,3 +21,25 @@ &rcc { clock-frequency = ; }; + +/* + Set PLL accordingly to freq. reported by 'clock-frequency' property, where: + + VCO freq = PLL clock input freq (HSI: 16 MHz) * N / M and + Core freq = VCO freq / R. + + Hence: + + VCO freq = 16 * 30 / 2 = 240 MHz and + Core freq = 240 MHz / 2 = 120 MHz + + Prop. 'div-p' and 'div-q' will be inherited from the overlaid 'pll' node.
+*/ + +&pll { + div-m = <2>; + mul-n = <30>; + div-r = <2>; + clocks = <&clk_hsi>; + status = "okay"; +}; From 397cf8781eba7a2bcc35e832130801c1d1419c43 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Thu, 15 Sep 2022 06:39:20 -0500 Subject: [PATCH 169/704] [Arith][Refactor] Return Optional from TryConstFold (#12784) Prior to this commit, the templated `TryConstFold` utility returned an undefined `PrimExpr` to represent a failure to perform constant folding. This commit makes this explicit by returning `Optional` instead. --- src/arith/canonical_simplify.cc | 21 +++----- src/arith/const_fold.h | 91 +++++++++++++++++---------------- src/arith/int_set.cc | 10 ++-- src/arith/iter_affine_map.cc | 15 ++---- src/arith/pattern_match.h | 3 +- src/arith/rewrite_simplify.cc | 42 +++++---------- src/tir/op/op.cc | 57 +++++++-------------- 7 files changed, 99 insertions(+), 140 deletions(-) diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index 9f45317cba11..f5d2667aa64e 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -716,8 +716,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const AddNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // canonical form simplification. SumExpr ret = ToSumExpr(std::move(a)); @@ -741,8 +740,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const SubNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // canonical form simplification. 
SumExpr ret = ToSumExpr(std::move(a)); @@ -766,8 +764,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const MulNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // x * c if (a.as()) { @@ -870,8 +867,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const DivNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold
(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold
(a, b)) return const_res.value(); PVar c1; // x / c1 if (c1.Match(b) && c1.Eval()->value > 0) { @@ -928,8 +924,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); PVar c1; // x / c1 if (c1.Match(b) && c1.Eval()->value > 0) { @@ -1037,8 +1032,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ModNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); PVar c1; // x % c1 @@ -1105,8 +1099,7 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const FloorModNode* op) { PrimExpr b = this->CanonicalMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); PVar c1; // x % c1 diff --git a/src/arith/const_fold.h b/src/arith/const_fold.h index d0e09a1a7429..a7466cf38c85 100644 --- a/src/arith/const_fold.h +++ b/src/arith/const_fold.h @@ -24,6 +24,7 @@ #ifndef TVM_ARITH_CONST_FOLD_H_ #define TVM_ARITH_CONST_FOLD_H_ +#include #include #include @@ -44,10 +45,10 @@ namespace arith { * \tparam Op The operator type. * * \note a and b Must already matched data types with each other. - * \return nullptr if constant fold fails, otherwise return folded result. + * \return NullOpt if constant fold fails, otherwise return folded result. */ template -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b); +inline Optional TryConstFold(PrimExpr a, PrimExpr b); /*! * \brief Try to run unary compute with constant folding. @@ -56,10 +57,10 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b); * \tparam Op The operator type. 
* * \note a and b Must already matched data types with each other. - * \return nullptr if constant fold fails, otherwise return folded result. + * \return NullOpt if constant fold fails, otherwise return folded result. */ template -inline PrimExpr TryConstFold(PrimExpr a); +inline Optional TryConstFold(PrimExpr a); /*! * \brief Check whether type is used to represent index. @@ -126,7 +127,7 @@ inline double GetFoldResultDoubleRepr(float x) { // specialization of constant folders. template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -142,17 +143,17 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value + fb->value); } else { - return PrimExpr(); + return NullOpt; } } if (fa && fa->value == 0) return b; if (fb && fb->value == 0) return a; }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ ICHECK(!((pa && pa->dtype.is_uint() && pa->value == 0U) && (pb && pb->dtype.is_uint() && pb->value > 0U))) @@ -171,16 +172,16 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value - fb->value); } else { - return PrimExpr(); + return NullOpt; } } if (fb && fb->value == 0) return a; }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -202,7 +203,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value * fb->value); } else { - return PrimExpr(); + return NullOpt; } } if (fa) { @@ -214,11 
+215,11 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { if (fb->value == 0) return b; } }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -242,7 +243,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } else if (rtype.bits() == 64) { return FloatImm(rtype, fa->value / fb->value); } else { - return PrimExpr(); + return NullOpt; } } if (fa && fa->value == 0) return a; @@ -251,11 +252,11 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(fb->value, 0) << "Divide by zero"; } }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -271,11 +272,11 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(pb->value, 0) << "Divide by zero"; } }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -297,7 +298,7 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { } else if (rtype.bits() == 64) { return FloatImm(rtype, std::floor(fa->value / fb->value)); } else { - return PrimExpr(); + return NullOpt; } } if (fa && fa->value == 0) return a; @@ -306,11 +307,11 @@ inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(fb->value, 0) << "Divide by zero"; } }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) { @@ -326,114 +327,114 @@ inline 
PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { ICHECK_NE(pb->value, 0) << "Divide by zero"; } }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) return IntImm(rtype, std::min(pa->value, pb->value)); if (fa && fb) return FloatImm(rtype, std::min(fa->value, fb->value)); }); if (a.same_as(b)) return a; - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); if (pa && pb) return IntImm(rtype, std::max(pa->value, pb->value)); if (fa && fb) return FloatImm(rtype, std::max(fa->value, fb->value)); }); if (a.same_as(b)) return a; - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value > pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value > fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value >= pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value >= fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value < pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value < fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr 
TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value <= pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value <= fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value == pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value == fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { TVM_ARITH_CONST_PROPAGATION({ if (pa && pb) return IntImm(DataType::UInt(1), pa->value != pb->value); if (fa && fb) return IntImm(DataType::UInt(1), fa->value != fb->value); }); - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { const IntImmNode* pa = a.as(); const IntImmNode* pb = b.as(); if (pa && pa->value) return b; if (pa && !pa->value) return a; if (pb && pb->value) return a; if (pb && !pb->value) return b; - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a, PrimExpr b) { +inline Optional TryConstFold(PrimExpr a, PrimExpr b) { const IntImmNode* pa = a.as(); const IntImmNode* pb = b.as(); if (pa && pa->value) return a; if (pa && !pa->value) return b; if (pb && pb->value) return b; if (pb && !pb->value) return a; - return PrimExpr(); + return NullOpt; } template <> -inline PrimExpr TryConstFold(PrimExpr a) { +inline Optional TryConstFold(PrimExpr a) { const IntImmNode* pa = a.as(); if (pa) { return IntImm(DataType::UInt(1), !(pa->value)); } - return PrimExpr(); + return NullOpt; } /*! 
\brief Helper namespace for symbolic value limits */ diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index e8e223ceca09..35b12bb35238 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -108,9 +108,13 @@ TVM_DECLARE_LOGICAL_OP(Not); template inline IntervalSet Combine(Analyzer* analyzer, IntervalSet a, IntervalSet b, DataType dtype) { if (a->IsSinglePoint() && b->IsSinglePoint()) { - PrimExpr res = TryConstFold(a->min_value, b->min_value); - if (!res.defined()) res = Op(a->min_value, b->min_value); - return IntervalSet::SinglePoint(res); + PrimExpr expr; + if (auto res = TryConstFold(a->min_value, b->min_value)) { + expr = res.value(); + } else { + expr = Op(a->min_value, b->min_value); + } + return IntervalSet::SinglePoint(expr); } if (is_logical_op::value) { return IntervalSet(make_const(dtype, 0), make_const(dtype, 1)); diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 83e2821c9800..182eada24d96 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -1205,8 +1205,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const AddNode* op) { PrimExpr b = this->DirectMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // does not contain iter map. if (!a->IsInstance() && !b->IsInstance()) { if (op->a.same_as(a) && op->b.same_as(b)) { @@ -1240,8 +1239,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const SubNode* op) { PrimExpr b = this->DirectMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // does not contain iter map. 
if (!a->IsInstance() && !b->IsInstance()) { @@ -1276,8 +1274,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) { PrimExpr b = this->DirectMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // does not contain iter map. if (!a->IsInstance() && !b->IsInstance()) { @@ -1572,8 +1569,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { PrimExpr b = this->DirectMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // does not contain iter map. if (!a->IsInstance() && !b->IsInstance()) { @@ -1657,8 +1653,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) { PrimExpr b = this->DirectMutate(op->b); // const folding - PrimExpr const_res = TryConstFold(a, b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(a, b)) return const_res.value(); // does not contain iter map. 
if (!a->IsInstance() && !b->IsInstance()) { diff --git a/src/arith/pattern_match.h b/src/arith/pattern_match.h index 6abcc728fc8d..69f064e11931 100644 --- a/src/arith/pattern_match.h +++ b/src/arith/pattern_match.h @@ -330,8 +330,7 @@ class PBinaryExpr : public Pattern> { PrimExpr Eval() const { PrimExpr lhs = a_.Eval(); PrimExpr rhs = b_.Eval(); - PrimExpr ret = TryConstFold(lhs, rhs); - if (ret.defined()) return ret; + if (auto ret = TryConstFold(lhs, rhs)) return ret.value(); return OpType(lhs, rhs); } diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index d7866fc1307b..e3e9db62d0bd 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -124,8 +124,7 @@ void RewriteSimplifier::Impl::Update(const Var& var, const PrimExpr& info, bool PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1, b2, s1, s2; // Pattern var match IntImm @@ -258,8 +257,7 @@ std::function RewriteSimplifier::Impl::EnterConstraint(const PrimExpr& c PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1, b2, s1, s2; // Pattern var match IntImm @@ -450,8 +448,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const SubNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) 
return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1, b2, s1, s2; // Pattern var match IntImm @@ -490,8 +487,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold
(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold
(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1; // Pattern var match IntImm @@ -666,8 +662,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const DivNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1; @@ -748,8 +743,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const ModNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1; // Pattern var match IntImm @@ -895,8 +889,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, b1; @@ -977,8 +970,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, s1, s2; @@ -1149,8 
+1141,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MinNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MaxNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); // Pattern var to match any expression PVar x, y, z, s1, s2; @@ -1327,8 +1318,7 @@ Optional RewriteSimplifier::Impl::TryMatchLiteralConstraint(const Prim PrimExpr RewriteSimplifier::Impl::VisitExpr_(const EQNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression @@ -1376,8 +1366,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const GENode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression @@ -1508,8 +1497,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const LTNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a)) return const_res.value(); if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression @@ -1534,8 +1522,7 @@ PrimExpr 
RewriteSimplifier::Impl::VisitExpr_(const NotNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression @@ -1574,8 +1561,7 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AndNode* op) { PrimExpr RewriteSimplifier::Impl::VisitExpr_(const OrNode* op) { PrimExpr ret = IRMutatorWithAnalyzer::VisitExpr_(op); op = ret.as(); - PrimExpr const_res = TryConstFold(op->a, op->b); - if (const_res.defined()) return const_res; + if (auto const_res = TryConstFold(op->a, op->b)) return const_res.value(); if (auto match = TryMatchLiteralConstraint(ret)) return match.value(); // Pattern var to match any expression diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index b9e0c3c37068..509badbebb92 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -327,8 +327,7 @@ PrimExpr operator+(PrimExpr a, PrimExpr b) { return add(a, b); } PrimExpr add(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Add(a, b, span); } @@ -349,23 +348,20 @@ PrimExpr operator-(PrimExpr a, PrimExpr b) { return sub(a, b); } PrimExpr sub(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Sub(a, b, span); } PrimExpr operator*(PrimExpr a, PrimExpr b) { return mul(a, b); } PrimExpr mul(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if 
(ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Mul(a, b, span); } PrimExpr div(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Div(a, b, span); } @@ -377,8 +373,7 @@ PrimExpr truncdiv(PrimExpr a, PrimExpr b, Span span) { PrimExpr truncmod(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Mod(a, b, span); } @@ -397,8 +392,7 @@ PrimExpr floordiv(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a; ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b; BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::FloorDiv(a, b, span); } @@ -406,8 +400,7 @@ PrimExpr ceildiv(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a; ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b; BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a + b - 1, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a + b - 1, b)) return ret.value(); return tir::FloorDiv(a + b - 1, b, span); } @@ -415,8 +408,7 @@ PrimExpr floormod(PrimExpr a, PrimExpr b, Span span) { ICHECK(a.dtype().is_int() || a.dtype().is_uint()) << a; ICHECK(b.dtype().is_int() || b.dtype().is_uint()) << b; BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::FloorMod(a, b, span); } @@ -429,8 +421,7 @@ PrimExpr min(PrimExpr a, PrimExpr b, Span span) { if 
(is_pos_inf(b)) return a; if (is_neg_inf(b)) return b; BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Min(a, b, span); } @@ -443,8 +434,7 @@ PrimExpr max(PrimExpr a, PrimExpr b, Span span) { if (is_pos_inf(b)) return b; if (is_neg_inf(b)) return a; BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Max(a, b, span); } @@ -475,48 +465,42 @@ PrimExpr likely(PrimExpr cond, Span span) { PrimExpr operator>(PrimExpr a, PrimExpr b) { return greater(a, b); } PrimExpr greater(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::GT(a, b, span); } PrimExpr operator>=(PrimExpr a, PrimExpr b) { return greater_equal(a, b); } PrimExpr greater_equal(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::GE(a, b, span); } PrimExpr operator<(PrimExpr a, PrimExpr b) { return less(a, b); } PrimExpr less(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::LT(a, b, span); } PrimExpr operator<=(PrimExpr a, PrimExpr b) { return less_equal(a, b); } PrimExpr less_equal(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::LE(a, b, span); } PrimExpr 
operator==(PrimExpr a, PrimExpr b) { return equal(a, b); } PrimExpr equal(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::EQ(a, b, span); } PrimExpr operator!=(PrimExpr a, PrimExpr b) { return not_equal(a, b); } PrimExpr not_equal(PrimExpr a, PrimExpr b, Span span) { BinaryOpMatchTypes(a, b, span); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::NE(a, b, span); } @@ -551,24 +535,21 @@ void type_check_integer_args(const PrimExpr& lhs, const PrimExpr& rhs, const cha PrimExpr operator&&(PrimExpr a, PrimExpr b) { return logical_and(a, b); } PrimExpr logical_and(PrimExpr a, PrimExpr b, Span span) { type_check_boolean_args(a, b, "&& operator (logical AND)"); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::And(a, b, span); } PrimExpr operator||(PrimExpr a, PrimExpr b) { return logical_or(a, b); } PrimExpr logical_or(PrimExpr a, PrimExpr b, Span span) { type_check_boolean_args(a, b, "|| operator (logical OR)"); - PrimExpr ret = arith::TryConstFold(a, b); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a, b)) return ret.value(); return tir::Or(a, b, span); } PrimExpr operator!(PrimExpr a) { return logical_not(a); } PrimExpr logical_not(PrimExpr a, Span span) { type_check_boolean_args(a, "! 
operator (logical NOT)"); - PrimExpr ret = arith::TryConstFold(a); - if (ret.defined()) return ret; + if (auto ret = arith::TryConstFold(a)) return ret.value(); return tir::Not(a, span); } From 1f8b5dec29e6e34b4cf5f092acf5b1d197a59d42 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 15 Sep 2022 13:15:10 -0700 Subject: [PATCH 170/704] [TIR, Schedule] Add schedule primitive PadEinsum (#12750) * [TIR, Schedule] Add schedule primitive PadEinsum Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> * lint * [TIR] Fix producer indices check in PadEinsum * address comments * simplify lambda expr * fix Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> --- include/tvm/tir/schedule/schedule.h | 20 + python/tvm/tir/schedule/schedule.py | 122 +++++ src/tir/schedule/analysis.h | 27 + src/tir/schedule/analysis/analysis.cc | 29 ++ src/tir/schedule/concrete_schedule.cc | 6 + src/tir/schedule/concrete_schedule.h | 1 + src/tir/schedule/primitive.h | 11 +- .../primitive/layout_transformation.cc | 36 +- src/tir/schedule/primitive/pad_einsum.cc | 474 ++++++++++++++++++ src/tir/schedule/schedule.cc | 3 +- src/tir/schedule/traced_schedule.cc | 12 +- src/tir/schedule/traced_schedule.h | 3 +- src/tir/schedule/transform.cc | 8 + src/tir/schedule/transform.h | 7 +- .../unittest/test_tir_schedule_pad_einsum.py | 122 +++++ 15 files changed, 841 insertions(+), 40 deletions(-) create mode 100644 src/tir/schedule/primitive/pad_einsum.cc create mode 100644 tests/python/unittest/test_tir_schedule_pad_einsum.py diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index da399ab976d6..8e5cd34d2e0b 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -627,6 +627,7 @@ class ScheduleNode : public runtime::Object { BufferIndexType buffer_index_type, const Array& axis_separators) = 0; + /******** Schedule: Padding ********/ /*! 
* \brief Decompose a padding block into a block filling const pad values and a block * writing in-bound values. @@ -636,6 +637,25 @@ class ScheduleNode : public runtime::Object { */ virtual BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) = 0; + /*! + * \brief Pad the computation of Einsum. + * \param block_rv The block that matches the Einsum pattern. + * \param padding The padding for each block iter. + * \details This schedule primitive identifies the Einsum pattern in the block body, and finds its + * producer blocks. It then pads the computation of the Einsum pattern and its producer blocks. + * The output buffer and the producer buffer is resized according to the padding size. It requires + * the output buffer and the producer buffer to be allocated inside the PrimFunc. + * + * The padding is a list of non-negative integers, each element corresponds to the padding for + * each block iter in the order of block iters. The block and its producer blocks should have + * trivial bindings, i.e. each block iter is bound to a single loop variable. After padding, the + * block iter extent and the corresponding outer loop is extended by the padding size. + * + * The size of the producer buffers are inferred from the padding size of the Einsum computation. + * The producer buffers are padded by the initial value of the corresponding reduction. + */ + virtual void PadEinsum(const BlockRV& block_rv, const Array& padding) = 0; + /******** Schedule: Misc ********/ /*! 
\brief A no-op that marks the start of postprocessing phase of scheduling */ virtual void EnterPostproc() = 0; diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index d1293371a0e0..fdc871703275 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2783,6 +2783,128 @@ def can_decompose_padding(self, block: Union[BlockRV, str], loop: LoopRV) -> boo """Check whether the block match padding pattern and can be decomposed.""" return _ffi_api.CanDecomposePadding(self, block, loop) # type: ignore # pylint: disable=no-member + @type_checked + def pad_einsum(self, block: Union[BlockRV, str], padding: List[int]) -> None: + """Pad the computation of Einsum. + + This schedule primitive identifies the Einsum pattern in the block body, and finds its + producer blocks. It then pads the computation of the Einsum pattern and its producer blocks. + The output buffer and the producer buffer is resized according to the padding size. It + requires the output buffer and the producer buffer to be allocated inside the PrimFunc. + + The padding is a list of non-negative integers, each element corresponds to the padding for + each block iter in the order of block iters. The block and its producer blocks should have + trivial bindings, i.e. each block iter is bound to a single loop variable. After padding, + the block iter extent and the corresponding outer loop is extended by the padding size. + + The size of the producer buffers are inferred from the padding size of the Einsum + computation. The producer buffers are padded by the initial value of the corresponding + reduction. + + Parameters + ---------- + block : Union[BlockRV, str] + The block that matches the Einsum pattern. + + padding : List[int] + The padding for each block iter. + + Examples + -------- + + Before applying pad-einsum, in TensorIR, the IR is: + + .. 
code-block:: python + + @T.prim_func + def before_pad_einsum( + A: T.Buffer[(128, 127), "float32"], + B: T.Buffer[(127, 127), "float32"], + C: T.Buffer[(128, 127), "float32"], + ) -> None: + A_shared = T.alloc_buffer((128, 127), "float32", scope="shared") + B_shared = T.alloc_buffer((127, 127), "float32", scope="shared") + C_shared = T.alloc_buffer((128, 127), "float32", scope="shared") + for i0, i1 in T.grid(128, 127): + with T.block("A"): + i, j = T.axis.remap("SS", [i0, i1]) + A_shared[i, j] = A[i, j] + for i0, i1 in T.grid(127, 127): + with T.block("B"): + i, j = T.axis.remap("SS", [i0, i1]) + B_shared[i, j] = B[i, j] + for i0, i1, i2 in T.grid(128, 127, 127): + with T.block("C_shared"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + with T.init(): + C_shared[i, j] = T.float32(0) + C_shared[i, j] = C_shared[i, j] + A_shared[i, k] * B_shared[k, j] + for i0, i1 in T.grid(128, 127): + with T.block("C"): + i, j = T.axis.remap("SS", [i0, i1]) + C[i, j] = C_shared[i, j] + + Create the schedule and do pad-einsum with specified block: + + .. code-block:: python + + sch = tir.Schedule(before_pad_einsum, debug_mask="all") + block = sch.get_block("C_shared") + sch.pad_einsum(block, [0, 1, 1]) + print(sch.mod["main"].script()) + + After applying pad-einsum, the IR becomes: + + .. 
code-block:: python + + @T.prim_func + def after_pad_einsum( + A: T.Buffer[(128, 127), "float32"], + B: T.Buffer[(127, 127), "float32"], + C: T.Buffer[(128, 127), "float32"], + ) -> None: + A_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + B_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + for i0, i1 in T.grid(128, 128): + with T.block("A"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(A[i, j]) + T.writes(A_shared_padded[i, j]) + A_shared_padded[i, j] = T.if_then_else( + j < 127, A[i, j], T.float32(0), dtype="float32" + ) + for i0, i1 in T.grid(128, 128): + with T.block("B"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(B[i, j]) + T.writes(B_shared_padded[i, j]) + B_shared_padded[i, j] = T.if_then_else( + i < 127 and j < 127, B[i, j], T.float32(0), dtype="float32" + ) + for i0, i1, i2 in T.grid(128, 128, 128): + with T.block("C_shared"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(A_shared_padded[i, k], B_shared_padded[k, j]) + T.writes(C_shared_padded[i, j]) + with T.init(): + C_shared_padded[i, j] = T.float32(0) + C_shared_padded[i, j] = ( + C_shared_padded[i, j] + A_shared_padded[i, k] * B_shared_padded[k, j] + ) + for i0, i1 in T.grid(128, 127): + with T.block("C"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(C_shared_padded[i, j]) + T.writes(C[i, j]) + C[i, j] = C_shared_padded[i, j] + + """ + block = self._normalize_block_arg(block) + return _ffi_api.SchedulePadEinsum( # type: ignore # pylint: disable=no-member + self, block, padding + ) + ########## Schedule: Misc ########## @type_checked diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index 489df8959d1b..ca45bcac6b34 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -298,6 +298,15 @@ bool GetVarsTouchedByBlockIters(const BlockRealize& block_realize, void CheckLoopStartsWithZero(const ScheduleState& 
self, const StmtSRef& loop_sref, arith::Analyzer* analyzer); +/*! + * \brief Check whether a block has a trivial binding, i.e. each block var is bound to a outer loop, + * from outer to inner. + * \param self The schedule state + * \param block_sref The block to be checked + * \throw ScheduleError If the block does not have trivial bindings + */ +void CheckBlockHasTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref); + /******** Block-loop relation ********/ /*! @@ -697,6 +706,24 @@ Array AnalyzeRegionLowerBound(const BufferRegion& region, const P const StmtSRef& dom_high_exclusive, arith::Analyzer* analyzer); +/*! + * \brief Check if buffer indices are all Vars and extract them + * \param buffer_access The BufferLoad or BufferStore + * \return The indices if the indices are all Vars, otherwise NullOpt + */ +template +Optional> CheckTrivialBufferIndices(const T& buffer_access) { + Array indices; + for (const PrimExpr& index : buffer_access->indices) { + const VarNode* var = index.as(); + if (var == nullptr) { + return NullOpt; + } + indices.push_back(GetRef(var)); + } + return indices; +} + /*! 
\brief Necessary information used for tensorization */ class TensorizeInfoNode : public Object { public: diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index 7ed60876ab22..4f78b0c9cd43 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -652,6 +652,35 @@ void CheckAffineBinding(const ScheduleState& self, Block block) { CheckPartialAffineBinding(self, std::move(block), NullOpt); } +void CheckBlockHasTrivialBinding(const ScheduleState& self, const StmtSRef& block_sref) { + class NotTrivialBindingError : public ScheduleError { + public: + explicit NotTrivialBindingError(IRModule mod, Block block) + : mod_(std::move(mod)), block_(std::move(block)) {} + + String FastErrorString() const final { + return "ScheduleError: The binding values of the block are not variables of outer loops."; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "The binding values of the {0} are not variables of outer loops."; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + + private: + IRModule mod_; + Block block_; + }; + + if (!IsTrivialBinding(self, block_sref)) { + throw NotTrivialBindingError(self->mod, GetRef(block_sref->StmtAs())); + } +} + Map LoopDomainOfSRefTreePath(const StmtSRef& low_inclusive, const Optional& high_exclusive, const runtime::StorageScope& extra_relax_scope) { diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index afc675799706..9d7dc6b95f50 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -795,6 +795,12 @@ BlockRV ConcreteScheduleNode::DecomposePadding(const BlockRV& block_rv, const Lo return CreateRV(result); } +void ConcreteScheduleNode::PadEinsum(const BlockRV& block_rv, const Array& padding) { + TVM_TIR_SCHEDULE_BEGIN(); + tir::PadEinsum(state_, 
this->GetSRef(block_rv), padding); + TVM_TIR_SCHEDULE_END("pad-einsum", this->error_render_level_); + this->state_->DebugVerify(); +} /******** Schedule: Misc ********/ } // namespace tir diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index e79d1d528809..1aa9dafcc93e 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -128,6 +128,7 @@ class ConcreteScheduleNode : public ScheduleNode { /******** Schedule: Reduction ********/ BlockRV RFactor(const LoopRV& loop_rv, int factor_axis) override; BlockRV DecomposeReduction(const BlockRV& block_rv, const LoopRV& loop_rv) override; + void PadEinsum(const BlockRV& block_rv, const Array& padding) override; /******** Schedule: Block annotation ********/ void StorageAlign(const BlockRV& block_rv, int buffer_index, int axis, int factor, int offset) override; diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 05d9e4cf944a..97233fe4bc6f 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -490,7 +490,7 @@ TVM_DLL void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int TVM_DLL void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref, const IndexMap& index_map); -/******** Schedule: Padding decomposition ********/ +/******** Schedule: Padding ********/ /*! * \brief Decompose a padding block into a block filling const pad values and a block * writing in-bound values. @@ -501,6 +501,15 @@ TVM_DLL void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref TVM_DLL StmtSRef DecomposePadding(ScheduleState self, const StmtSRef& block_sref, const StmtSRef& loop_sref); +/*! + * \brief Pad the computation of Einsum. + * \param self The state of the schedule + * \param block_sref The block sref that matches the Einsum pattern. + * \param padding The padding for each block iter. 
+ */ +TVM_DLL void PadEinsum(ScheduleState self, const StmtSRef& block_sref, + const Array& padding); + /******** Schedule: Misc ********/ } // namespace tir diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 8e2643db0103..32ed279f028f 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -278,40 +278,6 @@ class IndexMapNotApplicableToBlockIterError : public ScheduleError { IndexMap index_map_; }; -class NotTrivialBindingError : public ScheduleError { - public: - explicit NotTrivialBindingError(IRModule mod, Block block) - : mod_(std::move(mod)), block_(std::move(block)) {} - - static void CheckBlockHasTrivialBinding(const IRModule& mod, const BlockRealize& block_realize, - std::unordered_set outer_loop_vars) { - // Step 2: Check all the binding values are loops vars - for (const PrimExpr& iter_value : block_realize->iter_values) { - const VarNode* loop_var = iter_value.as(); - if (!loop_var || !outer_loop_vars.count(loop_var)) { - throw NotTrivialBindingError(mod, block_realize->block); - } - } - } - - String FastErrorString() const final { - return "ScheduleError: The binding values of the block are not variables of outer loops."; - } - - String DetailRenderTemplate() const final { - std::ostringstream os; - os << "The binding values of the {0} are not variables of outer loops."; - return os.str(); - } - - IRModule mod() const final { return mod_; } - Array LocationsOfInterest() const final { return {block_}; } - - private: - IRModule mod_; - Block block_; -}; - class OpaqueNewIterTypeError : public ScheduleError { public: explicit OpaqueNewIterTypeError(IRModule mod, Block block, PrimExpr iter_value) @@ -363,7 +329,7 @@ void TransformBlockLayout(ScheduleState self, const StmtSRef& block_sref, } BlockRealize block_realize = GetBlockRealize(self, block_sref); - 
NotTrivialBindingError::CheckBlockHasTrivialBinding(self->mod, block_realize, loop_vars); + CheckBlockHasTrivialBinding(self, block_sref); // Step 3: Collect information of block iter vars Array block_vars; // iter_var->var of each block iter diff --git a/src/tir/schedule/primitive/pad_einsum.cc b/src/tir/schedule/primitive/pad_einsum.cc new file mode 100644 index 000000000000..7a7b88d686f9 --- /dev/null +++ b/src/tir/schedule/primitive/pad_einsum.cc @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "../utils.h" + +namespace tvm { +namespace tir { + +/*! \brief The schedule error class when the padding size is invalid. */ +class InvalidPaddingError : public ScheduleError { + public: + InvalidPaddingError(IRModule mod, Block block, Array padding) + : mod_(std::move(mod)), block_(std::move(block)), padding_(std::move(padding)) {} + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + String FastErrorString() const final { + return "ScheduleError: The padding size for the block is invalid."; + } + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "The padding for the block {0} are invalid. 
It should be a list of " + << block_->iter_vars.size() << " non-negative integers. Got " << padding_; + return os.str(); + } + + static void Check(const ScheduleState& self, const Block& block, Array padding) { + if (padding.size() != block->iter_vars.size()) { + throw InvalidPaddingError(self->mod, block, padding); + } + for (const auto& pad : padding) { + if (pad->value < 0) { + throw InvalidPaddingError(self->mod, block, padding); + } + } + } + + private: + IRModule mod_; + Block block_; + Array padding_; +}; + +/*! \brief The schedule error class when the block body is not an Einsum pattern. */ +class NonEinsumError : public ScheduleError { + public: + explicit NonEinsumError(IRModule mod, Block block) + : mod_(std::move(mod)), block_(std::move(block)) {} + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {block_}; } + String FastErrorString() const final { + return "ScheduleError: The block is not a computation of Einsum pattern."; + } + String DetailRenderTemplate() const final { + return "The block {0} not a computation of Einsum pattern."; + } + + private: + IRModule mod_; + Block block_; +}; + +/*! \brief Data structure that represents a Einsum computation. */ +struct Einsum { + // The output buffer + Buffer output_buffer; + // The indices of the output buffer + Array output_indices; + // The indices of the input buffers + Map> input_indices; +}; + +class EinsumExtractor : public ExprVisitor { + public: + EinsumExtractor() = default; + + std::optional Extract(const Block& block) { + const BufferStoreNode* update = block->body.as(); + // Step 1: Check the body is a BufferStore and the block has the init statement, and the + // BufferStore and the init statement store have the same output buffer indices. 
+ if (update == nullptr || !block->init.defined()) { + return std::nullopt; + } + + if (Optional> opt_indices = CheckTrivialBufferIndices(update); + opt_indices.defined()) { + ein_sum_.output_indices = std::move(opt_indices.value()); + } else { + return std::nullopt; + } + ein_sum_.output_buffer = update->buffer; + + const BufferStoreNode* init = block->init.value().as(); + ICHECK(init != nullptr); + if (!CompareBufferIndices(init->indices, ein_sum_.output_indices)) { + return std::nullopt; + } + // Step 2: Check the BufferStore updates the output buffer and the input buffers indices are + // block iter variables. + CheckStoreValue(update->value); + if (fail_) { + return std::nullopt; + } + return std::move(ein_sum_); + } + + private: + void CheckStoreValue(const PrimExpr& update) { + // Check the update part has the form: + // Output[output_indices] += Input_0[input_indices_0] op_0 Input_1[input_indices_1] op_1 ... + // where output_indices and input_indices_i are arrays whose elements are the + // block iter variables instead of composite PrimExpr, and op_i are the binary operations. + + // Check the value is Add and either LHS or RHS is the BufferLoad from the output buffer. 
+ const AddNode* add = update.as(); + if (add == nullptr) { + fail_ = true; + return; + } + const BufferLoadNode* lhs = add->a.as(); + const BufferLoadNode* rhs = add->b.as(); + if (lhs == nullptr && rhs != nullptr) { + std::swap(lhs, rhs); + } + if (lhs == nullptr || !lhs->buffer.same_as(ein_sum_.output_buffer) || + !CompareBufferIndices(lhs->indices, ein_sum_.output_indices)) { + fail_ = true; + return; + } + VisitExpr(add->b); + } + + void VisitExpr(const PrimExpr& n) final { + if (n->IsInstance() || n->IsInstance() || n->IsInstance()) { + ExprVisitor::VisitExpr(n); + } else { + fail_ = true; + return; + } + } + + void VisitExpr_(const BufferLoadNode* op) final { + if (auto it = ein_sum_.input_indices.find(op->buffer); + it != ein_sum_.input_indices.end() && !CompareBufferIndices(op->indices, (*it).second)) { + fail_ = true; + return; + } + if (Optional> opt_indices = CheckTrivialBufferIndices(op); opt_indices.defined()) { + ein_sum_.input_indices.Set(op->buffer, std::move(opt_indices.value())); + } else { + fail_ = true; + return; + } + } + + void VisitExpr_(const CastNode* op) { VisitExpr(op->value); } + + bool Fail() { return fail_; } + + bool CompareBufferIndices(const Array& indices, const Array& other) { + return std::equal(indices.begin(), indices.end(), other.begin(), other.end(), + [](const PrimExpr& a, const Var& b) { return a.same_as(b); }); + } + + Einsum ein_sum_; + bool fail_{false}; +}; + +Einsum ExtractEinsum(const ScheduleState& self, const Block& block) { + EinsumExtractor extractor; + std::optional einsum = extractor.Extract(block); + if (!einsum.has_value()) { + throw NonEinsumError(self->mod, block); + } + return einsum.value(); +} + +class BufferNotAllocatedInScopeError : public ScheduleError { + public: + explicit BufferNotAllocatedInScopeError(IRModule mod, Buffer buffer) + : mod_(std::move(mod)), buffer_(std::move(buffer)) {} + + String FastErrorString() const final { + return "ScheduleError: The buffer is not allocated as an 
intermediate buffer in current " + "PrimFunc."; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "The buffer " << buffer_->name + << " is not allocated as an intermediate buffer in current PrimFunc."; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + private: + IRModule mod_; + Buffer buffer_; +}; + +class PadEinsumRewriter : public ReplaceBufferMutator { + public: + PadEinsumRewriter(const std::unordered_map producer_predicate, + Map padded_iter_extents, const Map& buffer_remap, + Map* block_sref_reuse, arith::Analyzer* analyzer) + : ReplaceBufferMutator(buffer_remap, block_sref_reuse), + producer_predicate_(producer_predicate), + padded_iter_extents_(padded_iter_extents), + analyzer_(analyzer) {} + + Stmt VisitStmt_(const ForNode* op) final { + For new_for = Downcast(ReplaceBufferMutator::VisitStmt_(op)); + if (padded_iter_extents_.count(new_for->loop_var)) { + new_for.CopyOnWrite()->extent = padded_iter_extents_.at(new_for->loop_var); + } + return std::move(new_for); + } + + Block PadProducerBlock(Block block, const PrimExpr& predicate) { + BufferStore store = Downcast(block->body); + store.CopyOnWrite()->value = + analyzer_->Simplify(if_then_else(predicate, store->value, make_zero(store->value.dtype()))); + block.CopyOnWrite()->body = std::move(store); + return block; + } + + Stmt VisitStmt_(const BlockNode* op) final { + Block old_block = GetRef(op); + Block new_block = Downcast(ReplaceBufferMutator::VisitStmt_(op)); + if (auto it = producer_predicate_.find(op); it != producer_predicate_.end()) { + new_block = PadProducerBlock(std::move(new_block), (*it).second); + } + + // Mutate block iters + Array new_iters; + bool changed = false; + for (const IterVar& iter : new_block->iter_vars) { + if (auto it = padded_iter_extents_.find(iter->var); it != padded_iter_extents_.end()) { + changed = true; + new_iters.push_back( + 
IterVar(Range::FromMinExtent(0, (*it).second), iter->var, iter->iter_type)); + } else { + new_iters.push_back(iter); + } + } + if (changed) { + new_block.CopyOnWrite()->iter_vars = std::move(new_iters); + } + if (!old_block.same_as(new_block)) { + block_sref_reuse_->Set(old_block, new_block); + } + return std::move(new_block); + } + + private: + const std::unordered_set producer_blocks_; + const std::unordered_map producer_predicate_; + const Map padded_iter_extents_; + arith::Analyzer* analyzer_; +}; + +/*! \brief The schedule error class when the producer block cannot be padded. */ +class InvalidProducerError : public ScheduleError { + public: + explicit InvalidProducerError(IRModule mod, Block producer) + : mod_(std::move(mod)), producer_(std::move(producer)) {} + + String FastErrorString() const final { + return "ScheduleError: The producer block cannot be padded."; + } + + String DetailRenderTemplate() const final { + std::ostringstream os; + os << "The producer block {0} cannot be padded. 
It should write to a single buffer and the " + "body should be a BufferStore."; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {producer_}; } + + private: + IRModule mod_; + Buffer buffer_; + Block producer_; +}; + +void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const Array& padding) { + arith::Analyzer analyzer; + // Step 1: Input checking and error handling + const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); + BlockRealize realize = GetBlockRealize(self, block_sref); + + const StmtSRef& scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + InvalidPaddingError::Check(self, GetRef(block), padding); + + const Array producers = GetProducers(self, block_sref); + { + auto f_check_block_properties = [&](const StmtSRef& block_sref, bool is_producer) { + CheckBlockHasTrivialBinding(self, block_sref); + if (is_producer) { + CheckCompleteBlock(self, block_sref, scope_sref); + } else { + CheckReductionBlock(self, block_sref, scope_sref); + } + Array loops = GetLoops(block_sref); + ICHECK(!loops.empty()); + CheckGetSingleChildBlockRealizeOnSRefTree(self, loops.front()); + }; + + // Check block properties of the computation block + f_check_block_properties(block_sref, false); + + // Check block properties of the producer block + for (const StmtSRef& producer_sref : producers) { + f_check_block_properties(producer_sref, true); + } + } + + Einsum einsum = ExtractEinsum(self, GetRef(block)); + + // Check input and output buffers are all allocated in the current scope. 
+ { + auto f_check_buffer_allocated = [&](const Buffer& buffer) { + auto [defining_site_sref, is_allocate] = GetBufferDefiningSite(block_sref, buffer); + if (!defining_site_sref.defined() || !is_allocate) { + throw BufferNotAllocatedInScopeError(self->mod, buffer); + } + }; + f_check_buffer_allocated(einsum.output_buffer); + for (const auto& buffer_indices_pair : einsum.input_indices) { + f_check_buffer_allocated(buffer_indices_pair.first); + } + } + + // Step 2: Prepare buffer and variable remapping. Infer the new shape of the input and the output + // buffers. Infer the new extent of the block iters of the computation block and the producer + // block. + + Map padded_iter_extents; // The new extents of both the block iters and loop vars + + // Convert the input padding array to a map from variables to the padded extents + for (int i = 0, n = padding.size(); i < n; ++i) { + const IterVar& iter = block->iter_vars[i]; + PrimExpr new_extent = + IntImm(iter->var->dtype, Downcast(iter->dom->extent)->value + padding[i]->value); + padded_iter_extents.Set(iter->var, new_extent); + padded_iter_extents.Set(Downcast(realize->iter_values[i]), new_extent); + } + + Map buffer_remap; // mapping from buffers to new buffers with padded shapes + + // Utility function to pad a buffer with the new shape + auto f_pad_buffer = [&padded_iter_extents, &buffer_remap](Buffer buffer, + const Array& indices) -> Buffer { + Array new_shape; + for (const Var& index : indices) { + new_shape.push_back(padded_iter_extents.at(index)); + } + ICHECK_EQ(buffer->shape.size(), new_shape.size()); + buffer.CopyOnWrite()->shape = std::move(new_shape); + return buffer; + }; + + buffer_remap.Set(einsum.output_buffer, f_pad_buffer(einsum.output_buffer, einsum.output_indices)); + + std::unordered_map producer_predicate; + + // Different from the output block, the padding for the producer block is not directly specified + // as the input argument. 
Instead, it is inferred from indices of the producer buffer accessed in + // the output block. + // We will find the indices (which are block iters) in BufferStore to the producer buffer + // and infer the new extents of the block iters and the corresponding loop vars. + for (const StmtSRef& producer_sref : producers) { + const BlockNode* producer_block = TVM_SREF_TO_BLOCK(producer_sref); + const BufferStoreNode* buffer_store = producer_block->body.as(); + Optional> producer_store_indices; + if (!buffer_store || producer_block->writes.size() != 1 || + !(producer_store_indices = CheckTrivialBufferIndices(buffer_store)).defined()) { + throw InvalidProducerError(self->mod, GetRef(producer_block)); + } + BlockRealize producer_realize = GetBlockRealize(self, producer_sref); + + const Buffer& old_buffer = producer_block->writes[0]->buffer; + Buffer new_buffer = f_pad_buffer(old_buffer, einsum.input_indices.at(old_buffer)); + buffer_remap.Set(old_buffer, new_buffer); + + // The predicate to ensure the producer block is in the original bound before padding + PrimExpr predicate = Bool(true); + Map indices_to_padded_extents; // buffer indices to padded extents + for (int i = 0, n = producer_store_indices.value().size(); i < n; ++i) { + const Var& index = producer_store_indices.value()[i]; + PrimExpr padded_extent = new_buffer->shape[i]; + if (!analyzer.CanProveEqual(padded_extent, old_buffer->shape[i])) { + predicate = predicate && (index < old_buffer->shape[i]); + } + indices_to_padded_extents.Set(index, padded_extent); + } + + for (int i = 0, n = producer_block->iter_vars.size(); i < n; ++i) { + const IterVar& iter = producer_block->iter_vars[i]; + if (auto it = indices_to_padded_extents.find(iter->var); + it != indices_to_padded_extents.end()) { + const PrimExpr& padded_extent = (*it).second; + padded_iter_extents.Set(iter->var, padded_extent); + padded_iter_extents.Set(Downcast(producer_realize->iter_values[i]), padded_extent); + } else if (!is_one(iter->dom->extent)) { 
+ throw InvalidProducerError(self->mod, GetRef(producer_block)); + } + } + producer_predicate[producer_block] = predicate; + } + + // Step 3: Mutate the AST subtree with the new buffers and the new block iter extents. + Map block_sref_reuse; + PadEinsumRewriter rewriter(producer_predicate, padded_iter_extents, buffer_remap, + &block_sref_reuse, &analyzer); + const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); + Stmt new_scope_block = rewriter(GetRef(scope_block)); + + // Step 4: Do the actual replacement. + self->Replace(scope_sref, new_scope_block, block_sref_reuse); +} + +/******** Instruction Registration ********/ + +struct PadEinsumTraits : public UnpackedInstTraits { + static constexpr const char* kName = "PadEinsum"; + static constexpr bool kIsPure = false; + + private: + static constexpr size_t kNumInputs = 1; + static constexpr size_t kNumAttrs = 1; + static constexpr size_t kNumDecisions = 0; + + static void UnpackedApplyToSchedule(Schedule sch, BlockRV block, Array padding) { + sch->PadEinsum(block, padding); + } + + static String UnpackedAsPython(Array outputs, String block, Array padding) { + PythonAPICall py("pad_einsum"); + py.Input("block", block); + py.Input("padding", padding); + return py.Str(); + } + + template + friend struct ::tvm::tir::UnpackedInstTraits; +}; + +TVM_REGISTER_INST_KIND_TRAITS(PadEinsumTraits); + +} // namespace tir +} // namespace tvm diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index 091db344aadb..d72f67fb7c2d 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -264,7 +264,8 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleSetAxisSeparator") /******** (FFI) Padding decomposition ********/ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleDecomposePadding") .set_body_method(&ScheduleNode::DecomposePadding); - +TVM_REGISTER_GLOBAL("tir.schedule.SchedulePadEinsum") + .set_body_method(&ScheduleNode::PadEinsum); /******** (FFI) Misc ********/ 
TVM_REGISTER_GLOBAL("tir.schedule.ScheduleEnterPostproc") .set_body_method(&ScheduleNode::EnterPostproc); diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index 04ddc0507dc4..a31950d33115 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -520,7 +520,7 @@ void TracedScheduleNode::SetAxisSeparator(const BlockRV& block_rv, int buffer_in /*outputs=*/{})); } -/******** Schedule: Padding decomposition ********/ +/******** Schedule: Padding ********/ BlockRV TracedScheduleNode::DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) { BlockRV new_block = ConcreteScheduleNode::DecomposePadding(block_rv, loop_rv); static const InstructionKind& kind = InstructionKind::Get("DecomposePadding"); @@ -532,6 +532,16 @@ BlockRV TracedScheduleNode::DecomposePadding(const BlockRV& block_rv, const Loop return new_block; } +void TracedScheduleNode::PadEinsum(const BlockRV& block_rv, const Array& padding) { + ConcreteScheduleNode::PadEinsum(block_rv, padding); + static const InstructionKind& kind = InstructionKind::Get("PadEinsum"); + trace_->Append(/*inst=*/Instruction( + /*kind=*/kind, + /*inputs=*/{block_rv}, + /*attrs=*/{padding}, + /*outputs=*/{})); +} + /******** Schedule: Misc ********/ void TracedScheduleNode::EnterPostproc() { diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index d98e4ba4bb95..ad44cc6ae552 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -108,8 +108,9 @@ class TracedScheduleNode : public ConcreteScheduleNode { void SetAxisSeparator(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, const Array& axis_separators) final; - /******** Schedule: Padding decomposition ********/ + /******** Schedule: Padding ********/ BlockRV DecomposePadding(const BlockRV& block_rv, const LoopRV& loop_rv) final; + void PadEinsum(const BlockRV& block_rv, const Array& padding) final; 
/******** Schedule: Misc ********/ void EnterPostproc() final; }; diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index c11fa656d6da..dfbd3dbcbcc4 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -103,6 +103,14 @@ ReplaceBufferMutator::ReplaceBufferMutator(const Buffer& old_buffer, Buffer new_ buffer_var_map_[old_buffer->data.get()] = std::move(new_buffer); } +ReplaceBufferMutator::ReplaceBufferMutator(const Map& buffer_map, + Map* block_sref_reuse) + : block_sref_reuse_(block_sref_reuse) { + for (const auto& [old_buffer, new_buffer] : buffer_map) { + buffer_var_map_[old_buffer->data.get()] = new_buffer; + } +} + PrimExpr ReplaceBufferMutator::VisitExpr_(const VarNode* var) { auto it = buffer_var_map_.find(var); return it != buffer_var_map_.end() ? it->second->data : GetRef(var); diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h index 908a823c2d86..4de3685e2482 100644 --- a/src/tir/schedule/transform.h +++ b/src/tir/schedule/transform.h @@ -114,7 +114,12 @@ class ReplaceBufferMutator : public StmtExprMutator { ReplaceBufferMutator(const Buffer& old_buffer, Buffer new_buffer, Map* block_sref_reuse); + ReplaceBufferMutator(const Map& buffer_map, Map* block_sref_reuse); + protected: + using StmtExprMutator::VisitExpr_; + using StmtExprMutator::VisitStmt_; + PrimExpr VisitExpr_(const VarNode* var) final; template @@ -132,7 +137,7 @@ class ReplaceBufferMutator : public StmtExprMutator { virtual MatchBufferRegion VisitMatchBufferRegion(const MatchBufferRegion& match_buffer); - Stmt VisitStmt_(const BlockNode* block) final; + Stmt VisitStmt_(const BlockNode* block) override; /*! 
* \brief A mapping which maps old buffer vars to new buffers, including the buffers defined in diff --git a/tests/python/unittest/test_tir_schedule_pad_einsum.py b/tests/python/unittest/test_tir_schedule_pad_einsum.py new file mode 100644 index 000000000000..89628db4ff74 --- /dev/null +++ b/tests/python/unittest/test_tir_schedule_pad_einsum.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-function-docstring,missing-module-docstring +import sys + +import pytest +import tvm +import tvm.testing +from tvm import tir, te +from tvm.script import tir as T +from tvm.tir.schedule.schedule import ScheduleError +from tvm.tir.schedule.testing import verify_trace_roundtrip +from tvm.meta_schedule.testing import te_workload + +# pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg + + +@T.prim_func +def matmul_before( + A: T.Buffer[(128, 127), "float32"], + B: T.Buffer[(127, 127), "float32"], + C: T.Buffer[(128, 127), "float32"], +) -> None: + A_shared = T.alloc_buffer((128, 127), "float32", scope="shared") + B_shared = T.alloc_buffer((127, 127), "float32", scope="shared") + C_shared = T.alloc_buffer((128, 127), "float32", scope="shared") + for i0, i1 in T.grid(128, 127): + with T.block("A"): + i, j = T.axis.remap("SS", [i0, i1]) + A_shared[i, j] = A[i, j] + for i0, i1 in T.grid(127, 127): + with T.block("B"): + i, j = T.axis.remap("SS", [i0, i1]) + B_shared[i, j] = B[i, j] + for i0, i1, i2 in T.grid(128, 127, 127): + with T.block("C_shared"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + with T.init(): + C_shared[i, j] = T.float32(0) + C_shared[i, j] = C_shared[i, j] + A_shared[i, k] * B_shared[k, j] + for i0, i1 in T.grid(128, 127): + with T.block("C"): + i, j = T.axis.remap("SS", [i0, i1]) + C[i, j] = C_shared[i, j] + + +@T.prim_func +def matmul_expected( + A: T.Buffer[(128, 127), "float32"], + B: T.Buffer[(127, 127), "float32"], + C: T.Buffer[(128, 127), "float32"], +) -> None: + A_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + B_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_shared_padded = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + for i0, i1 in T.grid(128, 128): + with T.block("A"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(A[i, j]) + T.writes(A_shared_padded[i, j]) + A_shared_padded[i, j] = T.if_then_else(j < 127, 
A[i, j], T.float32(0), dtype="float32") + for i0, i1 in T.grid(128, 128): + with T.block("B"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(B[i, j]) + T.writes(B_shared_padded[i, j]) + B_shared_padded[i, j] = T.if_then_else( + i < 127 and j < 127, B[i, j], T.float32(0), dtype="float32" + ) + for i0, i1, i2 in T.grid(128, 128, 128): + with T.block("C_shared"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(A_shared_padded[i, k], B_shared_padded[k, j]) + T.writes(C_shared_padded[i, j]) + with T.init(): + C_shared_padded[i, j] = T.float32(0) + C_shared_padded[i, j] = ( + C_shared_padded[i, j] + A_shared_padded[i, k] * B_shared_padded[k, j] + ) + for i0, i1 in T.grid(128, 127): + with T.block("C"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(C_shared_padded[i, j]) + T.writes(C[i, j]) + C[i, j] = C_shared_padded[i, j] + + +# pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg + + +def test_pad_matmul(): + sch = tir.Schedule(matmul_before, debug_mask="all") + C = sch.get_block("C_shared") + sch.pad_einsum(C, [0, 1, 1]) + tvm.ir.assert_structural_equal(matmul_expected, sch.mod["main"]) + verify_trace_roundtrip(sch, mod=matmul_before) + + +def test_pad_matmul_error_non_intermediate_buffer(): + func = te.create_prim_func(te_workload.matmul(128, 127, 127)) + sch = tir.Schedule(func, debug_mask="all") + C = sch.get_block("C") + with pytest.raises(ScheduleError): + sch.pad_einsum(C, [0, 1, 1]) + + +if __name__ == "__main__": + tvm.testing.main() From 9b1042585effaad047808520158260a33f3f0f75 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 15 Sep 2022 13:30:08 -0700 Subject: [PATCH 171/704] [Arith] Simplify nested if_then_else (#12749) [Arith] Simplify nested if_then_else Co-authored-by: Bohan Hou <32121147+spectrometerHBH@users.noreply.github.com> --- src/arith/rewrite_simplify.cc | 20 +++++++++++++++++++ .../unittest/test_arith_rewrite_simplify.py | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git 
a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index e3e9db62d0bd..2f7b88dfc508 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -1654,6 +1654,26 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const CallNode* op) { } } + if (op->op.same_as(tir::builtin::if_then_else())) { + // Simplify nested if_then_else + // if (cond) { if (inner_cond) { inner_then_expr } else { inner_else_expr } } else { else_expr } + // => if (cond && inner_cond) { inner_then_expr } else { else_expr } + const PrimExpr& cond = op->args[0]; + const PrimExpr& then_expr = op->args[1]; + const PrimExpr& else_expr = op->args[2]; + const CallNode* inner_call = then_expr.as(); + if (inner_call != nullptr && inner_call->op.same_as(tir::builtin::if_then_else())) { + const PrimExpr& inner_cond = inner_call->args[0]; + const PrimExpr& inner_then_expr = inner_call->args[1]; + const PrimExpr& inner_else_expr = inner_call->args[2]; + // Only check constant cases to avoid recursion + if (is_const_number(inner_else_expr) && is_const_number(else_expr) && + analyzer_->CanProve(inner_else_expr == else_expr)) { + return if_then_else(cond && inner_cond, inner_then_expr, else_expr); + } + } + } + return ret; } diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index c880f90ddffe..77751b160177 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -992,5 +992,15 @@ def test_sub_bufferload(): ck.verify(expr, 0.0) +def test_if_then_else_simplify(): + ck = RewriteChecker() + x = te.var("x", "int32") + z = tvm.tir.if_then_else(x < 5, tvm.tir.if_then_else(x > 1, 1, 0), 0) + ck.verify(z, tvm.tir.if_then_else(tvm.tir.And(tvm.tir.LT(x, 5), tvm.tir.LT(1, x)), 1, 0)) + + z = tvm.tir.if_then_else(x > 2, tvm.tir.if_then_else(x > 1, 1, 0), 0) + ck.verify(z, tvm.tir.if_then_else(tvm.tir.LT(2, x), 1, 0)) + + if __name__ == "__main__": 
pytest.main([__file__]) From f5517d4a08342e66fc70ba557930abc83f5cb20b Mon Sep 17 00:00:00 2001 From: Philipp van Kempen Date: Thu, 15 Sep 2022 23:02:22 +0200 Subject: [PATCH 172/704] [Docker][CI][RISC-V] Build riscv-isa-sim (spike) in ci_riscv Docker image to enable RISC-V unit testing (#12534) * Remove CSI-NN from ci_cortexm docker image * [Docker] [RISC-V] Split up CSI-NN2 installation script into several files [Docker] [RISC-V] move gcc toolchain installation out of csi-nn2 script [Docker] [RISC-V] move qemu installation out of csi-nn2 script * use updated version of qemu * [Docker] [RISC-V] Install newlib (baremetal) gcc toolchain * [Docker] [RISC-V] Install spike simulator * [Docker] move initialization of timezone and DEBIAN_FRONTEND to ubuntu_install_core.sh script --- docker/Dockerfile.ci_cortexm | 6 -- docker/Dockerfile.ci_riscv | 24 +++++- .../ubuntu_download_csinn2_compute_lib.sh | 20 ++--- .../ubuntu_download_xuantie_gcc_linux.sh | 57 +++++++++++++ .../ubuntu_download_xuantie_gcc_newlib.sh | 57 +++++++++++++ .../install/ubuntu_download_xuantie_qemu.sh | 56 +++++++++++++ docker/install/ubuntu_install_arduino.sh | 1 - docker/install/ubuntu_install_core.sh | 5 ++ docker/install/ubuntu_install_spike_sim.sh | 81 +++++++++++++++++++ docker/install/ubuntu_install_zephyr.sh | 5 -- 10 files changed, 288 insertions(+), 24 deletions(-) create mode 100755 docker/install/ubuntu_download_xuantie_gcc_linux.sh create mode 100755 docker/install/ubuntu_download_xuantie_gcc_newlib.sh create mode 100755 docker/install/ubuntu_download_xuantie_qemu.sh create mode 100755 docker/install/ubuntu_install_spike_sim.sh diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index 6ca2f2f40b75..a62ea059fa8c 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -110,11 +110,5 @@ RUN bash /install/ubuntu_install_ethosu_driver_stack.sh COPY install/ubuntu_install_vela.sh /install/ubuntu_install_vela.sh RUN bash /install/ubuntu_install_vela.sh 
-#Install CSI-NN2 -COPY install/ubuntu_download_csinn2_compute_lib.sh /install/ubuntu_download_csinn2_compute_lib.sh -RUN bash /install/ubuntu_download_csinn2_compute_lib.sh - # Update PATH ENV PATH /opt/arm/gcc-arm-none-eabi/bin:/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4:$PATH -ENV PATH /opt/csi-nn2/tools/gcc-toolchain/bin:$PATH -ENV PATH /opt/csi-nn2/tools/qemu/bin:$PATH diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index 9b956d55ddaa..5c597135ee41 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -84,10 +84,28 @@ COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr -#Install CSI-NN2 +# Download RISC-V gcc toolchain (linux) +COPY install/ubuntu_download_xuantie_gcc_linux.sh /install/ubuntu_download_xuantie_gcc_linux.sh +RUN bash /install/ubuntu_download_xuantie_gcc_linux.sh /opt/riscv/riscv64-unknown-linux-gnu + +# Download RISC-V gcc toolchain (baremetal) +COPY install/ubuntu_download_xuantie_gcc_newlib.sh /install/ubuntu_download_xuantie_gcc_newlib.sh +RUN bash /install/ubuntu_download_xuantie_gcc_newlib.sh /opt/riscv/riscv64-unknown-elf + +# Install RISC-V QEMU +COPY install/ubuntu_download_xuantie_qemu.sh /install/ubuntu_download_xuantie_qemu.sh +RUN bash /install/ubuntu_download_xuantie_qemu.sh /opt/riscv/qemu/ + +# Install CSI-NN2 COPY install/ubuntu_download_csinn2_compute_lib.sh /install/ubuntu_download_csinn2_compute_lib.sh RUN bash /install/ubuntu_download_csinn2_compute_lib.sh +# Build spike (riscv-isa-sim) and proxy kernel (pk) +COPY install/ubuntu_install_spike_sim.sh /install/ubuntu_install_spike_sim.sh +RUN bash /install/ubuntu_install_spike_sim.sh /opt/riscv/riscv64-unknown-elf/ + # Update PATH -ENV PATH /opt/csi-nn2/tools/gcc-toolchain/bin:$PATH -ENV PATH /opt/csi-nn2/tools/qemu/bin:$PATH +ENV PATH /opt/riscv/riscv64-unknown-linux-gnu/bin:$PATH +ENV PATH 
/opt/riscv/riscv64-unknown-elf/bin:$PATH +ENV PATH /opt/riscv/qemu/bin:$PATH +ENV PATH /opt/riscv/spike/bin:$PATH diff --git a/docker/install/ubuntu_download_csinn2_compute_lib.sh b/docker/install/ubuntu_download_csinn2_compute_lib.sh index 568ee4146084..4e483d173cbd 100755 --- a/docker/install/ubuntu_download_csinn2_compute_lib.sh +++ b/docker/install/ubuntu_download_csinn2_compute_lib.sh @@ -23,19 +23,21 @@ install_path="/opt/csi-nn2" # Clone CSI-NN2 Compute Library source code git clone --depth 1 --branch 1.12.2 https://github.com/T-head-Semi/csi-nn2.git ${install_path} -# download cross-compiler when not building natively. -# riscv gcc toolchain will be downloaded to "/path/csi-nn2/tools/gcc-toolchain". +# The toolchain is downloaded in: ubuntu_download_xuantie_gcc_linux.sh cd ${install_path} -./script/download_toolchain.sh - -# download custom QEMU to "/path/csi-nn2/tools/qemu". -./script/download_qemu.sh # build csinn2 lib for x86 and c906 # lib will be installed in /path/csi-nn2/install + # for x86 -make -j4; cd x86_build; make install; cd - +make -j4 +cd x86_build +make install +cd - + # for c906 mkdir -p riscv_build; cd riscv_build -cmake ../ -DBUILD_RISCV=ON; make -j4; make install; cd - - +export RISCV_GNU_GCC_PATH=/opt/riscv/riscv64-unknown-linux-gnu/bin +cmake ../ -DBUILD_RISCV=ON +make -j4 +make install; cd - diff --git a/docker/install/ubuntu_download_xuantie_gcc_linux.sh b/docker/install/ubuntu_download_xuantie_gcc_linux.sh new file mode 100755 index 000000000000..ab782b979785 --- /dev/null +++ b/docker/install/ubuntu_download_xuantie_gcc_linux.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +ubuntu_install_spike_sim.sh + +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +function show_usage() { + cat <<EOF +Usage: docker/install/ubuntu_download_xuantie_gcc_linux.sh <INSTALLATION_PATH> +INSTALLATION_PATH is the installation path for the toolchain. +EOF +} + +if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then + show_usage + exit 1 +fi + +INSTALLATION_PATH=$1 +shift + +# Create installation path directory +mkdir -p "${INSTALLATION_PATH}" + +# Download and extract RISC-V gcc +RISCV_GCC_VERSION="2.6.0" +RISCV_GCC_ID="1659325511536" +RISCV_GCC_KERNEL_VERSION="5.10.4" +RISCV_GCC_DATE="20220715" +RISCV_GCC_ARCH="x86_64" +RISCV_GCC_BASE="Xuantie-900-gcc-linux-${RISCV_GCC_KERNEL_VERSION}-glibc-${RISCV_GCC_ARCH}-V${RISCV_GCC_VERSION}-${RISCV_GCC_DATE}" +RISCV_GCC_EXT="tar.gz" +RISCV_GCC_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${RISCV_GCC_ID}/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}" +DOWNLOAD_PATH="/tmp/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}" + +wget ${RISCV_GCC_URL} -O "${DOWNLOAD_PATH}" +tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1 +rm $DOWNLOAD_PATH +echo "SUCCESS" diff --git a/docker/install/ubuntu_download_xuantie_gcc_newlib.sh b/docker/install/ubuntu_download_xuantie_gcc_newlib.sh new file mode 100755 index 000000000000..203bc1a2f076 --- /dev/null +++ b/docker/install/ubuntu_download_xuantie_gcc_newlib.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +ubuntu_install_spike_sim.sh + +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +function show_usage() { + cat <<EOF +Usage: docker/install/ubuntu_download_xuantie_gcc_newlib.sh <INSTALLATION_PATH> +INSTALLATION_PATH is the installation path for the toolchain. +EOF +} + +if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then + show_usage + exit 1 +fi + +INSTALLATION_PATH=$1 +shift + +# Create installation path directory +mkdir -p "${INSTALLATION_PATH}" + +# Download and extract RISC-V gcc +RISCV_GCC_VERSION="2.6.0" +RISCV_GCC_ID="1659318201401" +RISCV_GCC_DATE="20220715" +RISCV_GCC_ARCH="x86_64" +RISCV_GCC_BASE="Xuantie-900-gcc-elf-newlib-${RISCV_GCC_ARCH}-V${RISCV_GCC_VERSION}-${RISCV_GCC_DATE}" +RISCV_GCC_EXT="tar.gz" +# extra forward slash is required somehow +RISCV_GCC_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${RISCV_GCC_ID}/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}" +DOWNLOAD_PATH="/tmp/${RISCV_GCC_BASE}.${RISCV_GCC_EXT}" + +wget ${RISCV_GCC_URL} -O "${DOWNLOAD_PATH}" +tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1 +rm $DOWNLOAD_PATH +echo "SUCCESS" diff --git a/docker/install/ubuntu_download_xuantie_qemu.sh b/docker/install/ubuntu_download_xuantie_qemu.sh new file mode 100755 index 000000000000..56f0f3d0a34f --- /dev/null +++ b/docker/install/ubuntu_download_xuantie_qemu.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Licensed to the Apache
Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +function show_usage() { + cat <<EOF +Usage: docker/install/ubuntu_download_xuantie_qemu.sh <INSTALLATION_PATH> +INSTALLATION_PATH is the installation path for the tool. +EOF +} + +if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then + show_usage + exit 1 +fi + +INSTALLATION_PATH=$1 + +# Create installation path directory +mkdir -p "${INSTALLATION_PATH}" + +QEMU_DATE="20220623-0307" +QEMU_SOURCE_ID="1655972947885" +QEMU_ARCH="x86_64-Ubuntu-18.04" +QEMU_BASE="xuantie-qemu-${QEMU_ARCH}-${QEMU_DATE}" +QEMU_EXT="tar.gz" +QEMU_URL="https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//${QEMU_SOURCE_ID}/${QEMU_BASE}.${QEMU_EXT}" +DOWNLOAD_PATH="/tmp/${QEMU_BASE}.${QEMU_EXT}" + +wget ${QEMU_URL} -O "${DOWNLOAD_PATH}" +tar -xf "${DOWNLOAD_PATH}" -C "${INSTALLATION_PATH}" --strip-components=1 +rm $DOWNLOAD_PATH + +# Remove non riscv64 binaries?
(TODO) +# ls $INSTALLATION_PATH/bin | grep -v qemu-riscv64 | xargs -i rm -rf $INSTALLATION_PATH/bin/{} +# ls $INSTALLATION_PATH | grep -v bin | xargs -i rm -rf $INSTALLATION_PATH/{} + +echo "SUCCESS" diff --git a/docker/install/ubuntu_install_arduino.sh b/docker/install/ubuntu_install_arduino.sh index 107b452f8d3f..15dbd20fa758 100755 --- a/docker/install/ubuntu_install_arduino.sh +++ b/docker/install/ubuntu_install_arduino.sh @@ -20,7 +20,6 @@ set -e set -u set -o pipefail -export DEBIAN_FRONTEND=noninteractive apt-install-and-clear -y ca-certificates ARDUINO_CLI_VERSION="0.21.1" diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh index d20eeeba6998..a27c45433115 100755 --- a/docker/install/ubuntu_install_core.sh +++ b/docker/install/ubuntu_install_core.sh @@ -22,6 +22,11 @@ set -u set -x set -o pipefail +export DEBIAN_FRONTEND=noninteractive +export TZ=Etc/UTC +ln -snf /usr/share/zoneinfo/$TZ /etc/localtime +echo $TZ > /etc/timezone + # install libraries for building c++ core on ubuntu apt-get update && apt-install-and-clear -y --no-install-recommends \ apt-transport-https \ diff --git a/docker/install/ubuntu_install_spike_sim.sh b/docker/install/ubuntu_install_spike_sim.sh new file mode 100755 index 000000000000..24a11d758c38 --- /dev/null +++ b/docker/install/ubuntu_install_spike_sim.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +ubuntu_install_spike_sim.sh + +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail +set -x + +function show_usage() { + cat <<EOF +Usage: docker/install/ubuntu_install_spike_sim.sh <RISCV_PATH> +RISCV_PATH is the installation path of the risc-v gcc. +EOF +} + +if [ "$#" -lt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then + show_usage + exit -1 +fi + +export RISCV=$1 +export PATH=$RISCV/bin:$PATH +shift + +sudo apt-install-and-clear -y --no-install-recommends device-tree-compiler + +# Install spike +mkdir /tmp/spike +cd /tmp/spike +# TODO: freeze version? +git clone https://github.com/riscv/riscv-isa-sim.git +pushd riscv-isa-sim +mkdir build +cd build +../configure --prefix=$RISCV --with-isa=RV32IMAC +make -j`nproc` +make install +popd + +# Install pk +git clone https://github.com/riscv/riscv-pk.git +pushd riscv-pk + +# rv32imac +mkdir build +pushd build +../configure --prefix=`pwd`/install --host=riscv64-unknown-elf --with-arch=rv32imac +make -j`nproc` +make install +cp ./pk $RISCV/riscv64-unknown-elf/bin/pk +popd + +git status + +# rv64imac +mkdir build64 +pushd build64 +../configure --prefix=`pwd`/install --host=riscv64-unknown-elf --with-arch=rv64imac +make -j`nproc` +make install +cp ./pk $RISCV/riscv64-unknown-elf/bin/pk64 + +# cleanup +rm -rf /tmp/spike diff --git a/docker/install/ubuntu_install_zephyr.sh b/docker/install/ubuntu_install_zephyr.sh index f955a7ff9b19..552ad2626029 100755 --- a/docker/install/ubuntu_install_zephyr.sh +++ b/docker/install/ubuntu_install_zephyr.sh @@ -21,11 +21,6 @@ set -u set -o pipefail set -x -export DEBIAN_FRONTEND=noninteractive -export TZ=Etc/UTC -sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime -echo $TZ > /etc/timezone - sudo
apt-install-and-clear -y --no-install-recommends \ libsdl2-dev ca-certificates gnupg software-properties-common wget \ git cmake ninja-build gperf \ From c9002509f6a16ea04e711d151973fe4bcce6a365 Mon Sep 17 00:00:00 2001 From: Christopher Sidebottom Date: Thu, 15 Sep 2022 22:03:21 +0100 Subject: [PATCH 173/704] [Target] Print deprecation warning before canonicalisation in build module (#12747) Hopefully fixes #12742, as the warning should only be printed when a user passes `target_host`, in the current case if the user passes `None` as `target_host` it'll be processed by `canon_target_map_and_host` which seems to always produce a `target_host` and thus triggering the warning despite the user doing nothing wrong. --- python/tvm/driver/build_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 54db421e1be0..9389e7fbee60 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -258,8 +258,6 @@ def build( raise ValueError("inputs must be Schedule, IRModule," "or dict of str to IRModule.") annotated_mods[tar] = mod.with_attr("runtime", runtime) - annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host) - # TODO(mbs): Both CompilationConfig and TIRToRuntime implement the same host target # defaulting logic, but there's currently no way to get back the decided host. if target_host is not None: @@ -267,6 +265,8 @@ def build( "target_host parameter is going to be deprecated. " "Please pass in tvm.target.Target(target, host=target_host) instead." 
) + + annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host) if not target_host: for tar, mod in annotated_mods.items(): device_type = ndarray.device(tar.kind.name, 0).device_type From c00ce572c299a1cba6aede525be738c617e15325 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 15 Sep 2022 15:20:24 -0700 Subject: [PATCH 174/704] [ci] Add retries to docker push (#12773) This should mitigate failures like in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4274/pipeline. This also moves the `retry` function to a script now that we have PR #12604. Co-authored-by: driazati --- Jenkinsfile | 1555 +++--------------------------- ci/jenkins/Build.groovy.j2 | 18 +- ci/jenkins/Deploy.groovy.j2 | 6 +- ci/jenkins/DockerBuild.groovy.j2 | 6 +- ci/jenkins/Prepare.groovy.j2 | 6 +- ci/jenkins/Test.groovy.j2 | 2 +- ci/jenkins/macros.j2 | 34 +- ci/scripts/retry.sh | 39 + 8 files changed, 223 insertions(+), 1443 deletions(-) create mode 100644 ci/scripts/retry.sh diff --git a/Jenkinsfile b/Jenkinsfile index ed1cf4b09e6e..5835100fde3e 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-09-01T11:52:42.195970 +// Generated at 2022-09-14T11:22:31.582192 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -145,26 +145,7 @@ def init_git() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done -} - + . ci/scripts/retry.sh retry 3 timeout 5m git submodule update --init -f --jobs 0 """, label: 'Update git submodules', @@ -196,27 +177,8 @@ def docker_init(image) { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done -} - - retry 3 docker pull ${image} + . ci/scripts/retry.sh + retry 5 docker pull ${image} """, label: 'Pull docker image', ) @@ -453,8 +415,9 @@ def ecr_push(full_name) { sh( script: """ set -x + . ci/scripts/retry.sh docker tag ${full_name} \$AWS_ECR_REPO/${full_name} - docker push \$AWS_ECR_REPO/${full_name} + retry 5 docker push \$AWS_ECR_REPO/${full_name} """, label: 'Upload image to ECR' ) @@ -495,7 +458,8 @@ def ecr_pull(full_name) { sh( script: """ set -eux - docker pull ${full_name} + . 
ci/scripts/retry.sh + retry 5 docker pull ${full_name} """, label: 'Pull image from ECR' ) @@ -649,8 +613,8 @@ def lint() { 'Lint 1 of 2': { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { - docker_init(ci_lint) init_git() + docker_init(ci_lint) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS=2', @@ -669,8 +633,8 @@ def lint() { 'Lint 2 of 2': { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/lint") { - docker_init(ci_lint) init_git() + docker_init(ci_lint) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS=2', @@ -771,33 +735,14 @@ stage('Build') { if (!skip_ci) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-gpu") { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" make("${ci_gpu} --no-gpu", 'build', '-j2') sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu/build/libtvm.so md5sum build/libvta_fsim.so @@ -818,26 +763,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/gpu2/build/libtvm.so md5sum build/libvta_fsim.so @@ -858,8 +784,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu") { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", label: 'Create CPU cmake config', @@ -868,26 +794,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/cpu/build/libvta_tsim.so md5sum build/libtvm.so @@ -918,8 +825,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cpu-minimal") { - docker_init(ci_minimal) init_git() + docker_init(ci_minimal) sh ( script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build", label: 'Create CPU minimal cmake config', @@ -928,26 +835,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cpu-minimal/build/libtvm.so md5sum build/libtvm_runtime.so @@ -968,8 +856,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-wasm") { - docker_init(ci_wasm) init_git() + docker_init(ci_wasm) sh ( script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", label: 'Create WASM cmake config', @@ -993,8 +881,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-i386") { - docker_init(ci_i386) init_git() + docker_init(ci_i386) sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", label: 'Create i386 cmake config', @@ -1003,26 +891,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress build/libvta_tsim.so s3://${s3_prefix}/i386/build/libvta_tsim.so md5sum build/libtvm.so @@ -1047,8 +916,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-arm") { - docker_init(ci_arm) init_git() + docker_init(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", label: 'Create ARM cmake config', @@ -1057,26 +926,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/arm/build/libtvm.so md5sum build/libvta_fsim.so @@ -1099,8 +949,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-cortexm") { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) sh ( script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build", label: 'Create Cortex-M cmake config', @@ -1109,26 +959,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/cortexm/build/libtvm.so md5sum build/libtvm_runtime.so @@ -1150,8 +981,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-hexagon") { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", label: 'Create Hexagon cmake config', @@ -1164,26 +995,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/hexagon/build/libtvm.so md5sum build/libtvm_runtime.so @@ -1205,8 +1017,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/build-riscv") { - docker_init(ci_riscv) init_git() + docker_init(ci_riscv) sh ( script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build", label: 'Create RISC-V cmake config', @@ -1215,26 +1027,7 @@ stage('Build') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh md5sum build/libtvm.so retry 3 aws s3 cp --no-progress build/libtvm.so s3://${s3_prefix}/riscv/build/libtvm.so md5sum build/libtvm_runtime.so @@ -1266,8 +1059,8 @@ def shard_run_unittest_GPU_1_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -1278,26 +1071,7 @@ def shard_run_unittest_GPU_1_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu2/build/libvta_fsim.so build/libvta_fsim.so @@ -1315,26 +1089,7 @@ def shard_run_unittest_GPU_1_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -1382,8 +1137,8 @@ def shard_run_unittest_GPU_2_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -1394,26 +1149,7 @@ def shard_run_unittest_GPU_2_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -1464,8 +1200,8 @@ def shard_run_unittest_GPU_3_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -1476,26 +1212,7 @@ def shard_run_unittest_GPU_3_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -1543,8 +1260,8 @@ def shard_run_integration_CPU_1_of_4() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', @@ -1555,26 +1272,7 @@ def shard_run_integration_CPU_1_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so @@ -1619,8 +1317,8 @@ def shard_run_integration_CPU_2_of_4() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', @@ -1631,26 +1329,7 @@ def shard_run_integration_CPU_2_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so @@ -1695,8 +1374,8 @@ def shard_run_integration_CPU_3_of_4() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', @@ -1707,26 +1386,7 @@ def shard_run_integration_CPU_3_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so @@ -1771,8 +1431,8 @@ def shard_run_integration_CPU_4_of_4() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', @@ -1783,26 +1443,7 @@ def shard_run_integration_CPU_4_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so @@ -1848,8 +1489,8 @@ def shard_run_python_i386_1_of_3() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { - docker_init(ci_i386) init_git() + docker_init(ci_i386) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', @@ -1860,26 +1501,7 @@ def shard_run_python_i386_1_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so @@ -1924,8 +1546,8 @@ def shard_run_python_i386_2_of_3() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { - docker_init(ci_i386) init_git() + docker_init(ci_i386) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', @@ -1936,26 +1558,7 @@ def shard_run_python_i386_2_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so @@ -2000,8 +1603,8 @@ def shard_run_python_i386_3_of_3() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-i386") { try { - docker_init(ci_i386) init_git() + docker_init(ci_i386) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=i386', @@ -2012,26 +1615,7 @@ def shard_run_python_i386_3_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/i386/build/libvta_fsim.so build/libvta_fsim.so @@ -2076,8 +1660,8 @@ def shard_run_test_Hexagon_1_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2088,26 +1672,7 @@ def shard_run_test_Hexagon_1_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2151,8 +1716,8 @@ def shard_run_test_Hexagon_2_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2163,26 +1728,7 @@ def shard_run_test_Hexagon_2_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2225,8 +1771,8 @@ def shard_run_test_Hexagon_3_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2237,26 +1783,7 @@ def shard_run_test_Hexagon_3_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2299,8 +1826,8 @@ def shard_run_test_Hexagon_4_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2311,26 +1838,7 @@ def shard_run_test_Hexagon_4_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2373,8 +1881,8 @@ def shard_run_test_Hexagon_5_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2385,26 +1893,7 @@ def shard_run_test_Hexagon_5_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2447,8 +1936,8 @@ def shard_run_test_Hexagon_6_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2459,26 +1948,7 @@ def shard_run_test_Hexagon_6_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2521,8 +1991,8 @@ def shard_run_test_Hexagon_7_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2533,26 +2003,7 @@ def shard_run_test_Hexagon_7_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2595,8 +2046,8 @@ def shard_run_test_Hexagon_8_of_8() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { try { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=hexagon', @@ -2607,26 +2058,7 @@ def shard_run_test_Hexagon_8_of_8() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/hexagon/build/libtvm_runtime.so build/libtvm_runtime.so @@ -2670,8 +2102,8 @@ def shard_run_integration_aarch64_1_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -2682,26 +2114,7 @@ def shard_run_integration_aarch64_1_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -2745,8 +2158,8 @@ def shard_run_integration_aarch64_2_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -2757,26 +2170,7 @@ def shard_run_integration_aarch64_2_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -2820,8 +2214,8 @@ def shard_run_integration_aarch64_3_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -2832,26 +2226,7 @@ def shard_run_integration_aarch64_3_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -2895,8 +2270,8 @@ def shard_run_integration_aarch64_4_of_4() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -2907,26 +2282,7 @@ def shard_run_integration_aarch64_4_of_4() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -2971,8 +2327,8 @@ def shard_run_topi_GPU_1_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -2983,26 +2339,7 @@ def shard_run_topi_GPU_1_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3045,8 +2382,8 @@ def shard_run_topi_GPU_2_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3057,26 +2394,7 @@ def shard_run_topi_GPU_2_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3119,8 +2437,8 @@ def shard_run_topi_GPU_3_of_3() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3131,26 +2449,7 @@ def shard_run_topi_GPU_3_of_3() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3194,8 +2493,8 @@ def shard_run_frontend_GPU_1_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3206,26 +2505,7 @@ def shard_run_frontend_GPU_1_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3268,8 +2548,8 @@ def shard_run_frontend_GPU_2_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3280,26 +2560,7 @@ def shard_run_frontend_GPU_2_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3342,8 +2603,8 @@ def shard_run_frontend_GPU_3_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3354,26 +2615,7 @@ def shard_run_frontend_GPU_3_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3416,8 +2658,8 @@ def shard_run_frontend_GPU_4_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3428,26 +2670,7 @@ def shard_run_frontend_GPU_4_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3490,8 +2713,8 @@ def shard_run_frontend_GPU_5_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3502,26 +2725,7 @@ def shard_run_frontend_GPU_5_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3564,8 +2768,8 @@ def shard_run_frontend_GPU_6_of_6() { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") { try { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=gpu', @@ -3576,26 +2780,7 @@ def shard_run_frontend_GPU_6_of_6() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -3639,8 +2824,8 @@ def shard_run_topi_aarch64_1_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -3651,26 +2836,7 @@ def shard_run_topi_aarch64_1_of_2() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -3718,8 +2884,8 @@ def shard_run_topi_aarch64_2_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -3730,26 +2896,7 @@ def shard_run_topi_aarch64_2_of_2() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -3797,8 +2944,8 @@ def shard_run_frontend_aarch64_1_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -3809,26 +2956,7 @@ def shard_run_frontend_aarch64_1_of_2() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -3871,8 +2999,8 @@ def shard_run_frontend_aarch64_2_of_2() { node('ARM-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-arm") { try { - docker_init(ci_arm) init_git() + docker_init(ci_arm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=arm', @@ -3883,26 +3011,7 @@ def shard_run_frontend_aarch64_2_of_2() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/arm/build/libvta_fsim.so build/libvta_fsim.so @@ -3946,8 +3055,8 @@ def shard_run_test_Cortex_M_1_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -3958,26 +3067,7 @@ def shard_run_test_Cortex_M_1_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4025,8 +3115,8 @@ def shard_run_test_Cortex_M_2_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4037,26 +3127,7 @@ def shard_run_test_Cortex_M_2_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4099,8 +3170,8 @@ def shard_run_test_Cortex_M_3_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4111,26 +3182,7 @@ def shard_run_test_Cortex_M_3_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4173,8 +3225,8 @@ def shard_run_test_Cortex_M_4_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4185,26 +3237,7 @@ def shard_run_test_Cortex_M_4_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4247,8 +3280,8 @@ def shard_run_test_Cortex_M_5_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4259,26 +3292,7 @@ def shard_run_test_Cortex_M_5_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4321,8 +3335,8 @@ def shard_run_test_Cortex_M_6_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4333,26 +3347,7 @@ def shard_run_test_Cortex_M_6_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4395,8 +3390,8 @@ def shard_run_test_Cortex_M_7_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4407,26 +3402,7 @@ def shard_run_test_Cortex_M_7_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4469,8 +3445,8 @@ def shard_run_test_Cortex_M_8_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4481,26 +3457,7 @@ def shard_run_test_Cortex_M_8_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4543,8 +3500,8 @@ def shard_run_test_Cortex_M_9_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4555,26 +3512,7 @@ def shard_run_test_Cortex_M_9_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4617,8 +3555,8 @@ def shard_run_test_Cortex_M_10_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4629,26 +3567,7 @@ def shard_run_test_Cortex_M_10_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4691,8 +3610,8 @@ def shard_run_test_Cortex_M_11_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4703,26 +3622,7 @@ def shard_run_test_Cortex_M_11_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4765,8 +3665,8 @@ def shard_run_test_Cortex_M_12_of_12() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-cortexm") { try { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cortexm', @@ -4777,26 +3677,7 @@ def shard_run_test_Cortex_M_12_of_12() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cortexm/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4840,8 +3721,8 @@ def shard_run_test_RISC_V_1_of_1() { node('CPU-SMALL') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-riscv") { try { - docker_init(ci_riscv) init_git() + docker_init(ci_riscv) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=riscv', @@ -4852,26 +3733,7 @@ def shard_run_test_RISC_V_1_of_1() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/riscv/build/libtvm_runtime.so build/libtvm_runtime.so @@ -4917,32 +3779,13 @@ def run_unittest_minimal() { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu-minimal") { timeout(time: max_time, unit: 'MINUTES') { try { - docker_init(ci_minimal) init_git() + docker_init(ci_minimal) withEnv(['PLATFORM=minimal'], { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu-minimal/build/libtvm_runtime.so build/libtvm_runtime.so @@ -5134,34 +3977,15 @@ stage('Test') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") { timeout(time: max_time, unit: 'MINUTES') { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) withEnv(['PLATFORM=cpu', 'TEST_STEP_NAME=unittest: CPU', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - 
done - } - + . ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_tsim.so build/libvta_tsim.so md5sum build/libvta_tsim.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so @@ -5209,34 +4033,15 @@ stage('Test') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-cpu") { timeout(time: max_time, unit: 'MINUTES') { try { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) withEnv(['PLATFORM=cpu', 'TEST_STEP_NAME=frontend: CPU', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/cpu/build/libvta_fsim.so build/libvta_fsim.so @@ -5277,31 +4082,12 @@ stage('Test') { if (!skip_ci) { node('GPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/docs-python-gpu") { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . 
ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libtvm.so build/libtvm.so md5sum build/libtvm.so retry 3 aws s3 cp --no-progress s3://${s3_prefix}/gpu/build/libvta_fsim.so build/libvta_fsim.so @@ -5326,26 +4112,7 @@ stage('Test') { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh md5sum docs.tgz retry 3 aws s3 cp --no-progress docs.tgz s3://${s3_prefix}/docs/docs.tgz """, @@ -5395,10 +4162,11 @@ def update_docker(ecr_image, hub_image) { sh( script: """ set -eux + . ci/scripts/retry.sh docker tag \ ${ecr_image} \ ${hub_image} - docker push ${hub_image} + retry 5 docker push ${hub_image} """, label: "Update ${hub_image} on Docker Hub", ) @@ -5457,26 +4225,7 @@ def deploy() { sh( script: """ set -eux - retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done - } - + . ci/scripts/retry.sh retry 3 aws s3 cp --no-progress s3://${s3_prefix}/docs/docs.tgz docs.tgz md5sum docs.tgz """, @@ -5555,9 +4304,10 @@ def deploy() { sh( script: """ set -eux + . 
ci/scripts/retry.sh docker pull tlcpackstaging/ci_arm:${tag} docker tag tlcpackstaging/ci_arm:${tag} tlcpack/ci-arm:${tag} - docker push tlcpack/ci-arm:${tag} + retry 5 docker push tlcpack/ci-arm:${tag} """, label: 'Tag tlcpackstaging/ci_arm image to tlcpack', ) @@ -5568,9 +4318,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_cortexm:${tag} docker tag tlcpackstaging/ci_cortexm:${tag} tlcpack/ci-cortexm:${tag} - docker push tlcpack/ci-cortexm:${tag} + retry 5 docker push tlcpack/ci-cortexm:${tag} """, label: 'Tag tlcpackstaging/ci_cortexm image to tlcpack', ) @@ -5581,9 +4332,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_cpu:${tag} docker tag tlcpackstaging/ci_cpu:${tag} tlcpack/ci-cpu:${tag} - docker push tlcpack/ci-cpu:${tag} + retry 5 docker push tlcpack/ci-cpu:${tag} """, label: 'Tag tlcpackstaging/ci_cpu image to tlcpack', ) @@ -5594,9 +4346,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_gpu:${tag} docker tag tlcpackstaging/ci_gpu:${tag} tlcpack/ci-gpu:${tag} - docker push tlcpack/ci-gpu:${tag} + retry 5 docker push tlcpack/ci-gpu:${tag} """, label: 'Tag tlcpackstaging/ci_gpu image to tlcpack', ) @@ -5607,9 +4360,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_hexagon:${tag} docker tag tlcpackstaging/ci_hexagon:${tag} tlcpack/ci-hexagon:${tag} - docker push tlcpack/ci-hexagon:${tag} + retry 5 docker push tlcpack/ci-hexagon:${tag} """, label: 'Tag tlcpackstaging/ci_hexagon image to tlcpack', ) @@ -5620,9 +4374,10 @@ def deploy() { sh( script: """ set -eux + . 
ci/scripts/retry.sh docker pull tlcpackstaging/ci_i386:${tag} docker tag tlcpackstaging/ci_i386:${tag} tlcpack/ci-i386:${tag} - docker push tlcpack/ci-i386:${tag} + retry 5 docker push tlcpack/ci-i386:${tag} """, label: 'Tag tlcpackstaging/ci_i386 image to tlcpack', ) @@ -5633,9 +4388,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_lint:${tag} docker tag tlcpackstaging/ci_lint:${tag} tlcpack/ci-lint:${tag} - docker push tlcpack/ci-lint:${tag} + retry 5 docker push tlcpack/ci-lint:${tag} """, label: 'Tag tlcpackstaging/ci_lint image to tlcpack', ) @@ -5646,9 +4402,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_minimal:${tag} docker tag tlcpackstaging/ci_minimal:${tag} tlcpack/ci-minimal:${tag} - docker push tlcpack/ci-minimal:${tag} + retry 5 docker push tlcpack/ci-minimal:${tag} """, label: 'Tag tlcpackstaging/ci_minimal image to tlcpack', ) @@ -5659,9 +4416,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/ci_riscv:${tag} docker tag tlcpackstaging/ci_riscv:${tag} tlcpack/ci-riscv:${tag} - docker push tlcpack/ci-riscv:${tag} + retry 5 docker push tlcpack/ci-riscv:${tag} """, label: 'Tag tlcpackstaging/ci_riscv image to tlcpack', ) @@ -5672,9 +4430,10 @@ def deploy() { sh( script: """ set -eux + . 
ci/scripts/retry.sh docker pull tlcpackstaging/ci_wasm:${tag} docker tag tlcpackstaging/ci_wasm:${tag} tlcpack/ci-wasm:${tag} - docker push tlcpack/ci-wasm:${tag} + retry 5 docker push tlcpack/ci-wasm:${tag} """, label: 'Tag tlcpackstaging/ci_wasm image to tlcpack', ) diff --git a/ci/jenkins/Build.groovy.j2 b/ci/jenkins/Build.groovy.j2 index 51360b7d9c54..a083fe88ad80 100644 --- a/ci/jenkins/Build.groovy.j2 +++ b/ci/jenkins/Build.groovy.j2 @@ -84,8 +84,8 @@ stage('Build') { if (!skip_ci) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-gpu') }}) { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh build" make("${ci_gpu} --no-gpu", 'build', '-j2') {{ m.upload_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} @@ -102,8 +102,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-cpu') }}) { - docker_init(ci_cpu) init_git() + docker_init(ci_cpu) sh ( script: "${docker_run} ${ci_cpu} ./tests/scripts/task_config_build_cpu.sh build", label: 'Create CPU cmake config', @@ -126,8 +126,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-cpu-minimal') }}) { - docker_init(ci_minimal) init_git() + docker_init(ci_minimal) sh ( script: "${docker_run} ${ci_minimal} ./tests/scripts/task_config_build_minimal.sh build", label: 'Create CPU minimal cmake config', @@ -144,8 +144,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-wasm') }}) { - docker_init(ci_wasm) init_git() + docker_init(ci_wasm) sh ( script: "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh build", label: 'Create WASM cmake config', @@ -169,8 +169,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-i386') }}) { - docker_init(ci_i386) 
init_git() + docker_init(ci_i386) sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_config_build_i386.sh build", label: 'Create i386 cmake config', @@ -187,8 +187,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('ARM-SMALL') { ws({{ m.per_exec_ws('tvm/build-arm') }}) { - docker_init(ci_arm) init_git() + docker_init(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_config_build_arm.sh build", label: 'Create ARM cmake config', @@ -205,8 +205,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-cortexm') }}) { - docker_init(ci_cortexm) init_git() + docker_init(ci_cortexm) sh ( script: "${docker_run} ${ci_cortexm} ./tests/scripts/task_config_build_cortexm.sh build", label: 'Create Cortex-M cmake config', @@ -223,8 +223,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-hexagon') }}) { - docker_init(ci_hexagon) init_git() + docker_init(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_config_build_hexagon.sh build", label: 'Create Hexagon cmake config', @@ -245,8 +245,8 @@ stage('Build') { if (!skip_ci && is_docs_only_build != 1) { node('CPU-SMALL') { ws({{ m.per_exec_ws('tvm/build-riscv') }}) { - docker_init(ci_riscv) init_git() + docker_init(ci_riscv) sh ( script: "${docker_run} ${ci_riscv} ./tests/scripts/task_config_build_riscv.sh build", label: 'Create RISC-V cmake config', diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2 index 08516da41b9f..d2ee4360da6b 100644 --- a/ci/jenkins/Deploy.groovy.j2 +++ b/ci/jenkins/Deploy.groovy.j2 @@ -30,10 +30,11 @@ def update_docker(ecr_image, hub_image) { sh( script: """ set -eux + . 
ci/scripts/retry.sh docker tag \ ${ecr_image} \ ${hub_image} - docker push ${hub_image} + retry 5 docker push ${hub_image} """, label: "Update ${hub_image} on Docker Hub", ) @@ -144,9 +145,10 @@ def deploy() { sh( script: """ set -eux + . ci/scripts/retry.sh docker pull tlcpackstaging/{{ image.name }}:${tag} docker tag tlcpackstaging/{{ image.name }}:${tag} tlcpack/{{ image.name.replace("_", "-") }}:${tag} - docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag} + retry 5 docker push tlcpack/{{ image.name.replace("_", "-") }}:${tag} """, label: 'Tag tlcpackstaging/{{ image.name }} image to tlcpack', ) diff --git a/ci/jenkins/DockerBuild.groovy.j2 b/ci/jenkins/DockerBuild.groovy.j2 index 1f3bded86242..5ffbeded80fa 100644 --- a/ci/jenkins/DockerBuild.groovy.j2 +++ b/ci/jenkins/DockerBuild.groovy.j2 @@ -21,8 +21,9 @@ def ecr_push(full_name) { sh( script: """ set -x + . ci/scripts/retry.sh docker tag ${full_name} \$AWS_ECR_REPO/${full_name} - docker push \$AWS_ECR_REPO/${full_name} + retry 5 docker push \$AWS_ECR_REPO/${full_name} """, label: 'Upload image to ECR' ) @@ -63,7 +64,8 @@ def ecr_pull(full_name) { sh( script: """ set -eux - docker pull ${full_name} + . ci/scripts/retry.sh + retry 5 docker pull ${full_name} """, label: 'Pull image from ECR' ) diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 index 6d0c0ec9c4b6..4464108968de 100644 --- a/ci/jenkins/Prepare.groovy.j2 +++ b/ci/jenkins/Prepare.groovy.j2 @@ -33,7 +33,7 @@ def init_git() { sh( script: """ set -eux - {{ m.bash_retry() }} + . ci/scripts/retry.sh retry 3 timeout 5m git submodule update --init -f --jobs 0 """, label: 'Update git submodules', @@ -65,8 +65,8 @@ def docker_init(image) { sh( script: """ set -eux - {{ m.bash_retry() }} - retry 3 docker pull ${image} + . 
ci/scripts/retry.sh + retry 5 docker pull ${image} """, label: 'Pull docker image', ) diff --git a/ci/jenkins/Test.groovy.j2 b/ci/jenkins/Test.groovy.j2 index 4ed149da9be0..52ed742d4cc0 100644 --- a/ci/jenkins/Test.groovy.j2 +++ b/ci/jenkins/Test.groovy.j2 @@ -294,8 +294,8 @@ stage('Test') { if (!skip_ci) { node('GPU') { ws({{ m.per_exec_ws('tvm/docs-python-gpu') }}) { - docker_init(ci_gpu) init_git() + docker_init(ci_gpu) {{ m.download_artifacts(tag='gpu', filenames=tvm_multilib, folders=microtvm_template_projects) }} add_microtvm_permissions() timeout(time: 180, unit: 'MINUTES') { diff --git a/ci/jenkins/macros.j2 b/ci/jenkins/macros.j2 index 9d02ad68d6da..e6e69097b076 100644 --- a/ci/jenkins/macros.j2 +++ b/ci/jenkins/macros.j2 @@ -39,8 +39,8 @@ def {{ method_name }}() { node('{{ node }}') { ws({{ per_exec_ws(ws) }}) { try { - docker_init({{ docker_image }}) init_git() + docker_init({{ docker_image }}) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM={{ platform }}', @@ -71,8 +71,8 @@ def {{ method_name }}() { '{{ name }} {{ shard_index }} of {{ num_shards }}': { node('{{ node }}') { ws({{ per_exec_ws(ws) }}) { - docker_init({{ docker_image }}) init_git() + docker_init({{ docker_image }}) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'TVM_NUM_SHARDS={{ num_shards }}', @@ -95,8 +95,8 @@ def {{ method_name }}() { ws({{ per_exec_ws(ws) }}) { timeout(time: max_time, unit: 'MINUTES') { try { - docker_init({{ docker_image }}) init_git() + docker_init({{ docker_image }}) withEnv(['PLATFORM={{ platform }}'], { {{ caller() | indent(width=8) | trim }} }) @@ -120,8 +120,8 @@ def {{ method_name }}() { ws({{ per_exec_ws(ws) }}) { timeout(time: max_time, unit: 'MINUTES') { try { - docker_init({{ docker_image }}) init_git() + docker_init({{ docker_image }}) withEnv(['PLATFORM={{ platform }}', 'TEST_STEP_NAME={{ name }}', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { @@ -140,28 +140,6 @@ def {{ method_name }}() { }, {% endmacro %} -{% macro bash_retry() %} 
-retry() { - local max_retries=\$1 - shift - local n=0 - local backoff_max=30 - until [ "\$n" -ge \$max_retries ] - do - "\$@" && break - n=\$((n+1)) - if [ "\$n" -eq \$max_retries ]; then - echo "failed to update after attempt \$n / \$max_retries, giving up" - exit 1 - fi - - WAIT=\$(python3 -c 'import random; print(random.randint(10, 30))') - echo "failed to update \$n / \$max_retries, waiting \$WAIT to try again" - sleep \$WAIT - done -} -{% endmacro %} - {% macro deploy_step(name, feature_flag, ws) %} '{{ name }}': { if ({{ feature_flag }}) { @@ -182,7 +160,7 @@ retry() { sh( script: """ set -eux - {{ bash_retry() | indent(width=14) }} + . ci/scripts/retry.sh {% for filename in filenames %} md5sum {{ filename }} retry 3 aws s3 cp --no-progress {{ filename }} s3://${s3_prefix}/{{ tag }}/{{ filename }} @@ -199,7 +177,7 @@ sh( sh( script: """ set -eux - {{ bash_retry() | indent(width=14) }} + . ci/scripts/retry.sh {% for filename in filenames %} retry 3 aws s3 cp --no-progress s3://${s3_prefix}/{{ tag }}/{{ filename }} {{ filename }} md5sum {{ filename }} diff --git a/ci/scripts/retry.sh b/ci/scripts/retry.sh new file mode 100644 index 000000000000..08958fedce89 --- /dev/null +++ b/ci/scripts/retry.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +set -eux + +retry() { + local max_retries=$1 + shift + local n=0 + until [ "$n" -ge "$max_retries" ] + do + "$@" && break + n=$((n+1)) + if [ "$n" -eq "$max_retries" ]; then + echo "failed to update after attempt $n / $max_retries, giving up" + exit 1 + fi + + WAIT=$(python3 -c 'import random; print(random.randint(10, 30))') + echo "failed to update $n / $max_retries, waiting $WAIT to try again" + sleep "$WAIT" + done +} From 111a88d04ece6a0c6803277d5b7b1d4852b5e46c Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 15 Sep 2022 15:29:15 -0700 Subject: [PATCH 175/704] [ci][docker] Always build cmake from source (#12774) This should fix some version drift in the current cmake versions in the Docker containers (currently running all of 3.10, 3.16, 3.18, and 3.20) Co-authored-by: driazati --- docker/Dockerfile.ci_arm | 6 +++--- docker/Dockerfile.ci_cortexm | 6 +++--- docker/Dockerfile.ci_cpu | 6 +++--- docker/Dockerfile.ci_hexagon | 3 +++ docker/Dockerfile.ci_i386 | 6 +++--- docker/Dockerfile.ci_minimal | 3 +++ docker/Dockerfile.ci_riscv | 6 +++--- docker/Dockerfile.ci_wasm | 3 +++ docker/install/ubuntu_install_core.sh | 1 - 9 files changed, 24 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm index 932687f1e568..2297e8f1e6e7 100644 --- a/docker/Dockerfile.ci_arm +++ b/docker/Dockerfile.ci_arm @@ -29,6 +29,9 @@ RUN apt-install-and-clear -y ca-certificates gnupg2 COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh @@ -54,9 +57,6 @@ RUN bash 
/install/ubuntu_install_python.sh ENV PATH ${TVM_VENV}/bin:$PATH ENV PYTHONNOUSERSITE 1 # Disable .local directory from affecting CI. -COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh -RUN bash /install/ubuntu_install_cmake_source.sh - COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index a62ea059fa8c..db02792efda9 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -26,12 +26,12 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh -COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh -RUN bash /install/ubuntu_install_googletest.sh - COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh 3.20.0 +COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh +RUN bash /install/ubuntu_install_googletest.sh + ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 00fd9a4fcab3..155f9ef7d914 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -25,6 +25,9 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh @@ -41,9 +44,6 @@ RUN bash /install/ubuntu_install_python_package.sh COPY 
install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh RUN bash /install/ubuntu1804_install_llvm.sh -COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh -RUN bash /install/ubuntu_install_cmake_source.sh - COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh RUN bash /install/ubuntu_install_dnnl.sh diff --git a/docker/Dockerfile.ci_hexagon b/docker/Dockerfile.ci_hexagon index d2ed29278488..f1fc7be52484 100644 --- a/docker/Dockerfile.ci_hexagon +++ b/docker/Dockerfile.ci_hexagon @@ -28,6 +28,9 @@ RUN apt-install-and-clear -y ca-certificates gnupg2 libxml2-dev COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh diff --git a/docker/Dockerfile.ci_i386 b/docker/Dockerfile.ci_i386 index dc767ff6def1..b37e849819be 100644 --- a/docker/Dockerfile.ci_i386 +++ b/docker/Dockerfile.ci_i386 @@ -29,6 +29,9 @@ RUN apt-install-and-clear -y ca-certificates COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh @@ -49,9 +52,6 @@ COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh ENV PATH ${TVM_VENV}/bin:$PATH -COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh -RUN bash /install/ubuntu_install_cmake_source.sh - COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh RUN 
bash /install/ubuntu_install_python_package.sh diff --git a/docker/Dockerfile.ci_minimal b/docker/Dockerfile.ci_minimal index 974f3eea11d6..b4ba758901b4 100644 --- a/docker/Dockerfile.ci_minimal +++ b/docker/Dockerfile.ci_minimal @@ -25,6 +25,9 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash /install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh diff --git a/docker/Dockerfile.ci_riscv b/docker/Dockerfile.ci_riscv index 5c597135ee41..0d03db15e39b 100644 --- a/docker/Dockerfile.ci_riscv +++ b/docker/Dockerfile.ci_riscv @@ -26,12 +26,12 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh -COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh -RUN bash /install/ubuntu_install_googletest.sh - COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh RUN bash /install/ubuntu_install_cmake_source.sh +COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh +RUN bash /install/ubuntu_install_googletest.sh + ENV TVM_VENV /venv/apache-tvm-py3.7 COPY python/bootstrap/lockfiles /install/python/bootstrap/lockfiles COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm index 17230312f041..46f64b44dab5 100644 --- a/docker/Dockerfile.ci_wasm +++ b/docker/Dockerfile.ci_wasm @@ -23,6 +23,9 @@ RUN apt-get update --fix-missing COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +COPY install/ubuntu_install_cmake_source.sh /install/ubuntu_install_cmake_source.sh +RUN bash 
/install/ubuntu_install_cmake_source.sh + COPY install/ubuntu_install_googletest.sh /install/ubuntu_install_googletest.sh RUN bash /install/ubuntu_install_googletest.sh diff --git a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh index a27c45433115..7f26c6def25d 100755 --- a/docker/install/ubuntu_install_core.sh +++ b/docker/install/ubuntu_install_core.sh @@ -31,7 +31,6 @@ echo $TZ > /etc/timezone apt-get update && apt-install-and-clear -y --no-install-recommends \ apt-transport-https \ ca-certificates \ - cmake \ curl \ g++ \ gdb \ From 5b43c62ee64a7006dccc40811bd94de91d02a136 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 15 Sep 2022 15:43:33 -0700 Subject: [PATCH 176/704] [ci] Remove author check from ping bot (#12788) This has been working fine for a while, this code opens it up so it's not limited to the authors in #9983. Co-authored-by: driazati --- ci/scripts/ping_reviewers.py | 19 +------------------ tests/python/ci/test_ci.py | 20 -------------------- 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/ci/scripts/ping_reviewers.py b/ci/scripts/ping_reviewers.py index 0b034a795efd..af642a52a0eb 100755 --- a/ci/scripts/ping_reviewers.py +++ b/ci/scripts/ping_reviewers.py @@ -189,7 +189,6 @@ def make_ping_message(pr, reviewers): parser.add_argument("--wait-time-minutes", required=True, type=int, help="ssh remote to parse") parser.add_argument("--cutoff-pr-number", default=0, type=int, help="ssh remote to parse") parser.add_argument("--dry-run", action="store_true", help="don't update GitHub") - parser.add_argument("--allowlist", help="filter by these PR authors") parser.add_argument("--pr-json", help="(testing) data for testing to use instead of GitHub") parser.add_argument("--now", help="(testing) custom string for current time") args = parser.parse_args() @@ -208,17 +207,6 @@ def make_ping_message(pr, reviewers): end="", ) - # [slow rollout] - # This code is here to gate 
this feature to a limited set of people before - # deploying it for everyone to avoid spamming in the case of bugs or - # ongoing development. - if args.allowlist: - author_allowlist = args.allowlist.split(",") - else: - github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) - allowlist_issue = github.get("issues/9983") - author_allowlist = set(find_reviewers(allowlist_issue["body"])) - if args.pr_json: r = json.loads(args.pr_json) else: @@ -242,13 +230,8 @@ def make_ping_message(pr, reviewers): print( f"Skipping #{pr['number']} since it's too old ({pr['number']} <= {cutoff_pr_number})" ) - elif pr["author"]["login"] not in author_allowlist: - # [slow rollout] - print( - f"Skipping #{pr['number']} since author {pr['author']['login']} is not in allowlist: {author_allowlist}" - ) else: - print(f"Checking #{pr['number']} since author is in {author_allowlist}") + print(f"Checking #{pr['number']}") prs_to_check.append(pr) print(f"Summary: Checking {len(prs_to_check)} of {len(prs)} fetched") diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 79c72ce988c3..6c25694cfc74 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -548,24 +548,6 @@ def all_time_keys(time): }, check="Pinging reviewers ['someone'] on https://github.com/apache/tvm/pull/123", ), - # Check allowlist functionality - allowlist=dict( - pull_request={ - "number": 123, - "url": "https://github.com/apache/tvm/pull/123", - "body": "cc @someone", - "isDraft": False, - "author": {"login": "user2"}, - "reviews": {"nodes": []}, - **all_time_keys("2022-01-18T17:54:19Z"), - "comments": { - "nodes": [ - {**all_time_keys("2022-01-19T17:54:19Z"), "bodyText": "abc"}, - ] - }, - }, - check="Checking 0 of 1 fetched", - ), # Old comment, ping old_comment=dict( pull_request={ @@ -632,8 +614,6 @@ def test_ping_reviewers(tmpdir_factory, pull_request, check): "1", "--cutoff-pr-number", "5", - "--allowlist", - "user", "--pr-json", json.dumps(data), "--now", 
From afad20d8d9740baa1d251b7e80e8e56a1c2b7a4d Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 15 Sep 2022 16:38:38 -0700 Subject: [PATCH 177/704] Fix typo in doc of logging (#12798) --- include/tvm/runtime/logging.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h index 7b635eab0422..7dbc6d810dc0 100644 --- a/include/tvm/runtime/logging.h +++ b/include/tvm/runtime/logging.h @@ -476,12 +476,12 @@ inline bool DebugLoggingEnabled() { * * To enable file \p relay/foo.cc up to level 2 and \p ir/bar.cc for level 0 only set: * \code - * TVM_LOG_DEBUG="relay/foo.cc=2;ir/bar.cc=0" + * TVM_LOG_DEBUG="relay/foo.cc=2,ir/bar.cc=0" * \endcode * * To enable all files up to level 3 but disable \p ir/bar.cc set: * \code - * TVM_LOG_DEBUG="DEFAULT=2;ir/bar.cc=-1" + * TVM_LOG_DEBUG="DEFAULT=2,ir/bar.cc=-1" * \endcode * * Any of these settings will also enable DLOG statements. From 6a051843a9af11261cc0103837f517db14066fc5 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Thu, 15 Sep 2022 16:39:58 -0700 Subject: [PATCH 178/704] [TVMScript] IRBuilder methods for `For` (#12786) This PR introduces remaining IRBuilder methods for `For`. 
Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/frame.h | 51 ++++++ include/tvm/script/ir_builder/tir/ir.h | 53 ++++++ python/tvm/script/ir_builder/tir/frame.py | 9 + python/tvm/script/ir_builder/tir/ir.py | 172 ++++++++++++++++++ src/script/ir_builder/tir/frame.cc | 6 + src/script/ir_builder/tir/ir.cc | 76 ++++++++ .../unittest/test_tvmscript_ir_builder_tir.py | 55 ++++++ 7 files changed, 422 insertions(+) diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index 15ab77863e5e..2902b982d5a6 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -187,6 +187,57 @@ class BlockFrame : public TIRFrame { TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode); }; +/*! + * \brief A frame that represents the for loop. + * + * \sa ForFrame + */ +class ForFrameNode : public TIRFrameNode { + public: + /*! + * \brief Functions that generate loop nests. + * \param loop_vars The loop variables, from outer to inner + * \param loop_extents The loop extents that correspond to loop variables + * \param loop_body The loop body + * \return A stmt, the loop nest + */ + using FMakeForLoop = runtime::TypedPackedFunc loop_vars, Array loop_extents, tvm::tir::Stmt loop_body)>; + /*! \brief The loop variable. */ + Array vars; + /*! \brief The domains of iteration. */ + Array doms; + /*! \brief The for loop generating function. */ + FMakeForLoop f_make_for_loop; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("vars", &vars); + v->Visit("doms", &doms); + // `f_make_for_loop` is not visited. + } + + static constexpr const char* _type_key = "script.ir_builder.tir.ForFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(ForFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! 
+ * \brief Managed reference to ForFrameNode. + * + * \sa ForFrameNode + */ +class ForFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ForFrame, TIRFrame, ForFrameNode); +}; + /*! * \brief A frame that represents the assert statement. Proceeds if the condition is true, * otherwise aborts with the message. diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index aaa5442eede3..68948196ff6b 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -141,6 +141,59 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array shape, */ BlockFrame Block(String name, bool no_realize = false); +/*! + * \brief The serial For statement. + * \param start The minimum value of iteration. + * \param stop The maximum value of iteration. + * \param annotations The optional annotations of the For statement. + * \return The ForFrame. + */ +ForFrame Serial(PrimExpr start, PrimExpr stop, + Optional> annotations = NullOpt); +/*! + * \brief The parallel For statement. + * \param start The minimum value of iteration. + * \param stop The maximum value of iteration. + * \param annotations The optional annotations of the For statement. + * \return The ForFrame. + */ +ForFrame Parallel(PrimExpr start, PrimExpr stop, + Optional> annotations = NullOpt); +/*! + * \brief The vectorized For statement. + * \param start The minimum value of iteration. + * \param stop The maximum value of iteration. + * \param annotations The optional annotations of the For statement. + * \return The ForFrame. + */ +ForFrame Vectorized(PrimExpr start, PrimExpr stop, + Optional> annotations = NullOpt); +/*! + * \brief The unrolled For statement. + * \param start The minimum value of iteration. + * \param stop The maximum value of iteration. + * \param annotations The optional annotations of the For statement. + * \return The ForFrame. 
+ */ +ForFrame Unroll(PrimExpr start, PrimExpr stop, + Optional> annotations = NullOpt); +/*! + * \brief The thread-binding For statement. + * \param start The minimum value of iteration. + * \param stop The maximum value of iteration. + * \param thread The thread for loop variable to bind. + * \param annotations The optional annotations of the For statement. + * \return The ForFrame. + */ +ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread, + Optional> annotations = NullOpt); +/*! + * \brief The grid For statement. + * \param extents The extents of the iteration. + * \return The ForFrame. + */ +ForFrame Grid(Array extents); + /*! * \brief Evaluate the input expression. * \param value The input expression to evaluate. diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py index 0e7eb2bb4720..75bb0231aeef 100644 --- a/python/tvm/script/ir_builder/tir/frame.py +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -15,8 +15,10 @@ # specific language governing permissions and limitations # under the License. """IRBuilder for TIR""" +from typing import List, Union from tvm._ffi import register_object as _register_object +from tvm.tir import Var from ..base import IRBuilderFrame @@ -34,3 +36,10 @@ class PrimFuncFrame(TIRFrame): @_register_object("script.ir_builder.tir.BlockFrame") class BlockFrame(TIRFrame): ... 
+ + +@_register_object("script.ir_builder.tir.ForFrame") +class ForFrame(TIRFrame): + def __enter__(self) -> Union[Var, List[Var]]: + super().__enter__() + return self.vars if len(self.vars) > 1 else self.vars[0] diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 63fd1291f4bc..a5cdf8a3a105 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -344,6 +344,172 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame: return _ffi_api.Block(name, no_realize) # pylint: disable=no-member # type: ignore +def serial( + start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None +) -> frame.ForFrame: + """The serial For statement. + + Parameters + ---------- + start : PrimExpr + The minimum value of iteration. + + stop : PrimExpr + The maximum value of iteration. + + annotations : Dict[str, Any] + The optional annotations of the For statement. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + if stop is None: + stop = start + start = 0 + return _ffi_api.Serial(start, stop, annotations) # pylint: disable=no-member # type: ignore + + +def parallel( + start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None +) -> frame.ForFrame: + """The parallel For statement. + + Parameters + ---------- + start : PrimExpr + The minimum value of iteration. + + stop : PrimExpr + The maximum value of iteration. + + annotations : Dict[str, Any] + The optional annotations of the For statement. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + if stop is None: + stop = start + start = 0 + return _ffi_api.Parallel(start, stop, annotations) # pylint: disable=no-member # type: ignore + + +def vectorized( + start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None +) -> frame.ForFrame: + """The vectorized For statement. 
+ + Parameters + ---------- + start : PrimExpr + The minimum value of iteration. + + stop : PrimExpr + The maximum value of iteration. + + annotations : Dict[str, Any] + The optional annotations of the For statement. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + if stop is None: + stop = start + start = 0 + return _ffi_api.Vectorized(start, stop, annotations) # pylint: disable=no-member # type: ignore + + +def unroll( + start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None +) -> frame.ForFrame: + """The unrolled For statement. + + Parameters + ---------- + start : PrimExpr + The minimum value of iteration. + + stop : PrimExpr + The maximum value of iteration. + + annotations : Dict[str, Any] + The optional annotations of the For statement. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + if stop is None: + stop = start + start = 0 + return _ffi_api.Unroll(start, stop, annotations) # pylint: disable=no-member # type: ignore + + +def thread_binding( + start: PrimExpr, + stop: PrimExpr = None, + thread: str = None, + *, + annotations: Dict[str, Any] = None, +) -> frame.ForFrame: + """The thread-binding For statement. + + Parameters + ---------- + start : PrimExpr + The minimum value of iteration. + + stop : PrimExpr + The maximum value of iteration. + + thread : str + The thread for loop variable to bind. + + annotations : Dict[str, Any] + The optional annotations of the For statement. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + if thread is None: + if not isinstance(stop, str): + raise ValueError("Thread cannot be None for thread_binding") + thread = stop + stop = start + start = 0 + elif stop is None: + stop = start + start = 0 + return _ffi_api.ThreadBinding( # pylint: disable=no-member # type: ignore + start, stop, thread, annotations + ) + + +def grid(*extents: PrimExpr) -> frame.ForFrame: + """The grid For statement. 
+ + Parameters + ---------- + extents : PrimExpr + The extents of the iteration. + + Returns + ------- + res : frame.ForFrame + The ForFrame. + """ + return _ffi_api.Grid(extents) # pylint: disable=no-member # type: ignore + + def evaluate(value: PrimExpr) -> None: """Evaluate the input expression. @@ -677,6 +843,12 @@ def var(dtype, name="") -> Var: "match_buffer", "preflattened_buffer", "block", + "serial", + "parallel", + "vectorized", + "unroll", + "thread_binding", + "grid", "evaluate", "int8", "int16", diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc index dd3097e388b7..e54bf75eeff2 100644 --- a/src/script/ir_builder/tir/frame.cc +++ b/src/script/ir_builder/tir/frame.cc @@ -73,9 +73,15 @@ void BlockFrameNode::ExitWithScope() { } } +void ForFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts))); +} + TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); TVM_REGISTER_NODE_TYPE(BlockFrameNode); +TVM_REGISTER_NODE_TYPE(ForFrameNode); } // namespace tir } // namespace ir_builder diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index e2c1218a7e87..22c7face7084 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -173,6 +173,74 @@ BlockFrame Block(String name, bool no_realize) { return BlockFrame(n); } +#define TVM_TIR_IR_BUILDER_FOR_FRAME(Method, Kind) \ + ForFrame Method(PrimExpr start, PrimExpr stop, Optional> annotations) { \ + PrimExpr min = start; \ + PrimExpr extent = arith::Analyzer().Simplify(stop - start); \ + ObjectPtr n = make_object(); \ + int bits = std::max(min.dtype().bits(), extent.dtype().bits()); \ + n->vars = {Var("v", DataType::Int(bits))}; \ + n->doms = {Range::FromMinExtent(min, extent)}; \ + n->f_make_for_loop = [annotations](Array vars, Array doms, tvm::tir::Stmt body) { \ + ICHECK_EQ(vars.size(), 1); \ + ICHECK_EQ(doms.size(), 1); \ + return 
tvm::tir::For(vars[0], doms[0]->min, doms[0]->extent, Kind, body, NullOpt, \ + annotations.value_or(Map())); \ + }; \ + return ForFrame(n); \ + } + +TVM_TIR_IR_BUILDER_FOR_FRAME(Serial, tvm::tir::ForKind::kSerial); +TVM_TIR_IR_BUILDER_FOR_FRAME(Parallel, tvm::tir::ForKind::kParallel); +TVM_TIR_IR_BUILDER_FOR_FRAME(Vectorized, tvm::tir::ForKind::kVectorized); +TVM_TIR_IR_BUILDER_FOR_FRAME(Unroll, tvm::tir::ForKind::kUnrolled); + +#undef TVM_TIR_IR_BUILDER_FOR_FRAME + +ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread, + Optional> annotations) { + using namespace tvm::tir; + PrimExpr min = start; + PrimExpr extent = arith::Analyzer().Simplify(stop - start); + ObjectPtr n = make_object(); + int bits = std::max(min.dtype().bits(), extent.dtype().bits()); + n->vars = {Var("v", DataType::Int(bits))}; + n->doms = {Range::FromMinExtent(min, extent)}; + n->f_make_for_loop = [annotations, thread](Array vars, Array doms, Stmt body) -> For { + ICHECK_EQ(vars.size(), 1); + ICHECK_EQ(doms.size(), 1); + IterVar iter_var(Range(nullptr), Var("iter", DataType::Int(32)), IterVarType::kThreadIndex, + thread); + return For(vars[0], doms[0]->min, doms[0]->extent, ForKind::kThreadBinding, body, iter_var, + annotations.value_or(Map())); + }; + return ForFrame(n); +} + +ForFrame Grid(Array extents) { + using namespace tvm::tir; + ObjectPtr n = make_object(); + n->vars.reserve(extents.size()); + n->doms.reserve(extents.size()); + for (const auto& extent : extents) { + DataType dtype = extent.dtype(); + n->vars.push_back(Var("v", extent.dtype())); + n->doms.push_back(Range(make_const(dtype, 0), extent)); + } + n->f_make_for_loop = [](Array vars, Array doms, Stmt body) -> Stmt { + ICHECK_EQ(vars.size(), doms.size()); + int n = vars.size(); + for (int i = n - 1; i >= 0; --i) { + Range dom = doms[i]; + Var var = vars[i]; + body = For(var, dom->min, dom->extent, ForKind::kSerial, std::move(body), + /*thread_binding=*/NullOpt, /*annotations=*/{}); + } + return body; + }; + 
return ForFrame(n); +} + void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } using tvm::script::ir_builder::details::Namer; @@ -235,6 +303,14 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuf TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block); + +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Serial").set_body_typed(Serial); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Parallel").set_body_typed(Parallel); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Vectorized").set_body_typed(Vectorized); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Unroll").set_body_typed(Unroll); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.ThreadBinding").set_body_typed(ThreadBinding); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8); diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 5c93e99909d9..9cbfd75e2280 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -114,5 +114,60 @@ def test_ir_builder_tir_block(): assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) +def test_ir_builder_tir_for(): + with IRBuilder() as ib: + with T.serial(128) as a: + with T.parallel(64) as b: + with T.vectorized(32) as c: + with T.unroll(16) as d: + with T.thread_binding(8, thread="threadIdx.x") as e: + T.evaluate(0) + + # the for generated by IRBuilder + for_actual = ib.get() + + # the expected for + thread_binding_expected = tir.For( + loop_var=tir.Var("", "int32"), + min_val=0, + extent=8, + kind=tir.ForKind.THREAD_BINDING, + body=tir.Evaluate(0), + 
thread_binding=tir.IterVar( + None, tir.Var("", "int32"), tir.IterVar.ThreadIndex, "threadIdx.x" + ), + ) + unroll_expected = tir.For( + loop_var=tir.Var("", "int32"), + min_val=0, + extent=16, + kind=tir.ForKind.UNROLLED, + body=thread_binding_expected, + ) + vectorized_expected = tir.For( + loop_var=tir.Var("", "int32"), + min_val=0, + extent=32, + kind=tir.ForKind.VECTORIZED, + body=unroll_expected, + ) + parallel_expected = tir.For( + loop_var=tir.Var("", "int32"), + min_val=0, + extent=64, + kind=tir.ForKind.PARALLEL, + body=vectorized_expected, + ) + for_expected = tir.For( + loop_var=tir.Var("", "int32"), + min_val=0, + extent=128, + kind=tir.ForKind.SERIAL, + body=parallel_expected, + ) + # Check if the generated ir is expected + assert_structural_equal(for_actual, for_expected, map_free_vars=True) + + if __name__ == "__main__": tvm.testing.main() From 9a3b3dd1ceac8f9b065636146756baead39b8ab6 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Fri, 16 Sep 2022 07:40:55 +0800 Subject: [PATCH 179/704] [TVMScript] Fix parse minimal i32 literal for tir script (#12772) This change tries to fix an issue due to #12515. Previously the logic for `-2147483648` is `parse(-literal)` = `-parse(literal)`, and all integer literals are converted to i32 (either the literal value actually overflow or not). Since after #12515, parse `2147483648` results in an i64 typed integer rather than i32, `-2147483648` then becomes an i64 integer too, which is not reasonable. 
--- python/tvm/script/parser.py | 7 +++++++ tests/python/unittest/test_tvmscript_roundtrip.py | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index e9b4286edad8..c34aae23453c 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -906,6 +906,13 @@ def transform_Call(self, node): ) if node.func_name.name in self._unaryop_maker: rhs = self.transform(node.params[0]) + if node.func_name.name == ast.BuiltinOp.USub and isinstance( + node.params[0], ast.Constant + ): + # '-literal' should be parsed together for proper literal type inference + if not isinstance(rhs, (tvm.tir.IntImm, tvm.tir.FloatImm)): + self.report_error("The literal is illegal after -", node.params[0].span) + return tvm.tir.const(-rhs.value) return self._unaryop_maker[node.func_name.name]( rhs, span=tvm_span_from_synr(node.span) ) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 17622789558d..1f5871b488e2 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3381,6 +3381,15 @@ def func( return func +def minimal_i32_literal(): + @T.prim_func + def func() -> None: + T.evaluate(T.int32(-2147483648)) + T.evaluate(-T.int64(2147483648)) + + return func + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3423,6 +3432,7 @@ def func( decl_buffer, allocate_and_decl_buffer, float_infinity, + minimal_i32_literal, ) From c96cc1101ff1a78b69945680574a69c1402a29ff Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 15 Sep 2022 17:07:39 -0700 Subject: [PATCH 180/704] [community] Fix outdated contributor GitHub usernames (#12799) These couple names were linking to 404 pages, this PR updates them to their current counterparts. 
Co-authored-by: driazati --- CONTRIBUTORS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 42f67e87df10..a5da6c8abc79 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -45,7 +45,7 @@ We do encourage everyone to work anything they are interested in. - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay - [Chenfan Jia](https://github.com/jcf94): @jcf94 - auto_scheduler - [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler -- [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm - ethos-u, memory planner +- [Manupa Karunaratne](https://github.com/manupak): @manupak - ethos-u, memory planner - [Elen Kalda](https://github.com/ekalda): @ekalda - ethos-u, arm - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay - [Tristan Konolige](https://github.com/tkonolige): @tkonolige - profiling, relay, tir, runtime @@ -70,7 +70,7 @@ We do encourage everyone to work anything they are interested in. - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - aot, arm - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Christopher Sidebottom](https://github.com/Mousius): @Mousius - arm, ethos-u, relay -- [Junru Shao](https://github.com/junrushao1994) (PMC): @junrushao1994 - relay, compiler +- [Junru Shao](https://github.com/junrushao) (PMC): @junrushao - relay, compiler - [Haichen Shen](https://github.com/icemelon) (PMC): @icemelon - relay, topi - [Chris Sullivan](https://github.com/csullivan): @csullivan - amd backend - [Siva Rama Krishna Reddy](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang @@ -85,7 +85,7 @@ We do encourage everyone to work anything they are interested in. 
- [Hao Yu](https://github.com/comaniac): @comaniac (PMC) - relay, byoc, auto_scheduler - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, auto_scheduler, topi, relay - [Xiyou Zhou](https://github.com/zxybazh): @zxybazh - relay -- [wrongtest](https://github.com/wrongtest): @wrongtest - tir, tvm-script, arith +- [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif - tir, tvm-script, arith ## Reviewers @@ -120,7 +120,7 @@ We do encourage everyone to work anything they are interested in. - [Hua Jiang](https://github.com/huajsj): @huajsj - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang - [Hongyi Jin](https://github.com/jinhongyii): @jinhongyii -- [Manupa Karunaratne](https://github.com/manupa-arm): @manupa-arm +- [Manupa Karunaratne](https://github.com/manupak): @manupak - [Elen Kalda](https://github.com/ekalda): @ekalda - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - [Michael J. Klaiber](https://github.com/MichaelJKlaiber/) @MichaelJKlaiber @@ -162,7 +162,7 @@ We do encourage everyone to work anything they are interested in. - [Gustavo Romero](https://github.com/gromero): @gromero - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel -- [Junru Shao](https://github.com/junrushao1994): @junrushao1994 +- [Junru Shao](https://github.com/junrushao): @junrushao - [Haichen Shen](https://github.com/icemelon): @icemelon - [Xingjian Shi](https://github.com/sxjscience): @sxjscience - [Yuanjing Shi](https://github.com/shingjan): @shingjan @@ -187,7 +187,7 @@ We do encourage everyone to work anything they are interested in. 
- [Logan Weber](https://github.com/weberlo): @weberlo - [Matt Welsh](https://github.com/mdw-octoml): @mdw-octoml - [Jian Weng](https://github.com/were): @were -- [wrongtest](https://github.com/wrongtest): @wrongtest +- [wrongtest](https://github.com/wrongtest-intellif): @wrongtest-intellif - [Yong Wu](https://github.com/yongwww): @yongwww - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene - [Bing Xu](https://github.com/antinucleon): @antinucleon From e6525a30e6de3bc3f95564beeead8e9e8b1f9efc Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 15 Sep 2022 18:49:22 -0700 Subject: [PATCH 181/704] [TIR] Add extra simpliciation in region cover analysis (#12800) Added extra simplify step to eliminate false negative cases. --- src/tir/schedule/state.cc | 5 ++ .../test_tir_schedule_state_cached_flags.py | 86 ++++++++++++++++++- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc index 15d0e08ddc2c..6d4a42236f57 100644 --- a/src/tir/schedule/state.cc +++ b/src/tir/schedule/state.cc @@ -108,6 +108,11 @@ bool ProducerCoversConsumer(const Array& buffer_shape, produced = arith::Intersect({produced, buffer_size}); consumed = arith::Intersect({consumed, buffer_size}); + produced = arith::IntSet::Interval(analyzer->Simplify(produced.min()), + analyzer->Simplify(produced.max())); + consumed = arith::IntSet::Interval(analyzer->Simplify(consumed.min()), + analyzer->Simplify(consumed.max())); + if (!analyzer->CanProve((analyzer->canonical_simplify(produced.min() - consumed.min()) <= 0) && (analyzer->canonical_simplify(consumed.max() - produced.max()) <= 0))) { return false; diff --git a/tests/python/unittest/test_tir_schedule_state_cached_flags.py b/tests/python/unittest/test_tir_schedule_state_cached_flags.py index bbeb8d87600b..987821714078 100644 --- a/tests/python/unittest/test_tir_schedule_state_cached_flags.py +++ b/tests/python/unittest/test_tir_schedule_state_cached_flags.py @@ -26,7 +26,7 @@ from 
tvm.tir.stmt_functor import post_order_visit # pylint: disable=no-member,invalid-name,unused-variable,unexpected-keyword-arg - +# fmt: off @T.prim_func def elementwise(a: T.handle, c: T.handle) -> None: @@ -366,7 +366,80 @@ def uncovered_producer_region(A: T.Buffer[(128,), "float32"], B: T.Buffer[(128,) B[vi] = A[vi] +@T.prim_func +def matmul_relu_padding(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 127), "float16"], compute: T.Buffer[(127, 127), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([127, 127], dtype="float32") + A_reindex = T.alloc_buffer([128, 128], dtype="float16") + B_reindex = T.alloc_buffer([128, 128], dtype="float16") + C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator") + for ax0, ax1, ax2 in T.grid(128, 1, 128): + with T.block("A_reindex"): + v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2]) + T.reads(A[v0, v2]) + T.writes(A_reindex[v0, v2]) + A_reindex[v0, v2] = T.if_then_else(v0 < 127 and v2 < 127, A[v0, v2], T.float16(0), dtype="float16") + for ax0, ax1, ax2 in T.grid(1, 128, 128): + with T.block("B_reindex"): + v0, v1, v2 = T.axis.remap("SSS", [ax0, ax1, ax2]) + T.reads(B[v2, v1]) + T.writes(B_reindex[v2, v1]) + B_reindex[v2, v1] = T.if_then_else(v2 < 127 and v1 < 127, B[v2, v1], T.float16(0), dtype="float16") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(2, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(1, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(16, thread="threadIdx.y"): + for ax2_0_0, ax2_0_1, ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(2, 2, 1, 2, 2, 1, 1): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0_3 + ax0_0_4) + v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused * 4 + 
ax0_0_2_ax1_0_2_fused % 2 * 2 + ax1_0_3) + v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax2_0_2) + T.reads(A_reindex[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0, ax1 in T.grid(16, 32): + with T.block("C_reindex_shared_wmma.accumulator"): + v0 = T.axis.spatial(128, ax0_0_2_ax1_0_2_fused // 2 * 16 + ax0) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_2_ax1_0_2_fused % 2 * 32 + ax1) + T.reads(C_reindex_shared_wmma_accumulator[v0, v1]) + T.writes(C_reindex_shared[v0, v1]) + C_reindex_shared[v0, v1] = C_reindex_shared_wmma_accumulator[v0, v1] + for 
ax0, ax1 in T.grid(128, 64): + with T.block("C_reindex_shared"): + v0 = T.axis.spatial(128, ax0) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax1) + T.where(ax0 < 127 and ax0_0_0_ax1_0_0_fused * 64 + ax1 < 127) + T.reads(C_reindex_shared[v0, v1]) + T.writes(C[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + C[v0, v1] = C_reindex_shared[v0, v1] + for i0, i1 in T.grid(127, 127): + with T.block("compute"): + i0_1, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(C[i0_1, i1_1]) + T.writes(compute[i0_1, i1_1]) + compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0)) + + # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg +# fmt: on def _get_block(s: tir.ScheduleState, name_hint: str) -> tir.StmtSRef: @@ -781,5 +854,16 @@ def test_uncovered_producer_region(): # pylint: enable=protected-access +def test_matmul_relu_padding(): + s = tir.ScheduleState(matmul_relu_padding, debug_mask="all") + # pylint: disable=protected-access + assert s._get_cached_flags(_get_block(s, "C_reindex_shared")) == CachedFlags( + affine_binding=True, + region_cover=True, + stage_pipeline=True, + ) + # pylint: enable=protected-access + + if __name__ == "__main__": tvm.testing.main() From 02c2eae510d6d6c15189427c97819f7ce05f002d Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 15 Sep 2022 22:01:33 -0700 Subject: [PATCH 182/704] [MetaSchedule] Enable Clone Function for Task-Level Classes (#12796) This PR introduces a clone function for each of the task-level MetaSchedule classes for convenient class deep copying. 
- [x] ScheduleRule - [x] Postproc - [x] Mutator - [x] SpaceGenerator - [x] SearchStrategy - [x] TuneContext --- include/tvm/meta_schedule/mutator.h | 88 ++++++++------ include/tvm/meta_schedule/postproc.h | 86 ++++++++----- include/tvm/meta_schedule/schedule_rule.h | 86 ++++++++----- include/tvm/meta_schedule/search_strategy.h | 114 +++++++++++------- include/tvm/meta_schedule/space_generator.h | 78 +++++++----- include/tvm/meta_schedule/tune_context.h | 6 + python/tvm/meta_schedule/mutator/mutator.py | 24 +++- python/tvm/meta_schedule/postproc/postproc.py | 24 +++- .../schedule_rule/schedule_rule.py | 32 +++-- .../search_strategy/search_strategy.py | 23 ++++ .../space_generator/space_generator.py | 24 +++- .../tvm/meta_schedule/testing/dummy_object.py | 3 + python/tvm/meta_schedule/tune_context.py | 10 ++ .../mutator/mutate_compute_location.cc | 5 + src/meta_schedule/mutator/mutate_parallel.cc | 5 + .../mutator/mutate_thread_binding.cc | 5 + src/meta_schedule/mutator/mutate_tile_size.cc | 5 + src/meta_schedule/mutator/mutate_unroll.cc | 5 + src/meta_schedule/mutator/mutator.cc | 8 ++ .../postproc/disallow_dynamic_loop.cc | 5 + src/meta_schedule/postproc/postproc.cc | 8 ++ .../postproc/rewrite_cooperative_fetch.cc | 5 + src/meta_schedule/postproc/rewrite_layout.cc | 5 + .../rewrite_parallel_vectorize_unroll.cc | 6 + .../postproc/rewrite_reduction_block.cc | 5 + .../postproc/rewrite_tensorize.cc | 5 + .../postproc/rewrite_unbound_block.cc | 5 + src/meta_schedule/postproc/verify_gpu_code.cc | 6 + .../schedule_rule/add_rfactor.cc | 6 + src/meta_schedule/schedule_rule/auto_bind.cc | 6 + .../schedule_rule/auto_inline.cc | 6 + .../schedule_rule/cross_thread_reduction.cc | 6 + .../schedule_rule/multi_level_tiling.cc | 6 + .../schedule_rule/multi_level_tiling.h | 3 + .../multi_level_tiling_tensor_core.cc | 7 ++ .../multi_level_tiling_with_intrin.cc | 7 ++ .../parallel_vectorize_unroll.cc | 7 ++ .../schedule_rule/random_compute_location.cc | 6 + 
.../schedule_rule/schedule_rule.cc | 9 ++ .../search_strategy/evolutionary_search.cc | 18 +++ .../search_strategy/replay_func.cc | 10 ++ .../search_strategy/replay_trace.cc | 11 ++ .../search_strategy/search_strategy.cc | 11 +- .../space_generator/post_order_apply.cc | 9 ++ .../space_generator/schedule_fn.cc | 5 + .../space_generator/space_generator.cc | 12 +- .../space_generator/space_generator_union.cc | 9 ++ src/meta_schedule/tune_context.cc | 26 ++++ 48 files changed, 675 insertions(+), 186 deletions(-) diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index 566cc82e9716..2b580e75e019 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -32,6 +32,7 @@ namespace tvm { namespace meta_schedule { class TuneContext; +class Mutator; /*! \brief Mutator is designed to mutate the trace to explore the design space. */ class MutatorNode : public runtime::Object { @@ -57,12 +58,21 @@ class MutatorNode : public runtime::Object { virtual Optional Apply(const tir::Trace& trace, support::LinearCongruentialEngine::TRandState* rand_state) = 0; + /*! + * \brief Clone the mutator. + * \return The cloned mutator. + */ + virtual Mutator Clone() const = 0; + static constexpr const char* _type_key = "meta_schedule.Mutator"; TVM_DECLARE_BASE_OBJECT_INFO(MutatorNode, Object); }; -/*! \brief The mutator with customized methods on the python-side. */ -class PyMutatorNode : public MutatorNode { +/*! + * \brief Managed reference to MutatorNode + * \sa MutatorNode + */ +class Mutator : public runtime::ObjectRef { public: /*! * \brief The function type of `InitializeWithTuneContext` method. @@ -76,39 +86,16 @@ class PyMutatorNode : public MutatorNode { */ using FApply = runtime::TypedPackedFunc( const tir::Trace&, support::LinearCongruentialEngine::TRandState rand_state)>; + /*! + * \brief Clone the mutator. + * \return The cloned mutator. + */ + using FClone = runtime::TypedPackedFunc; /*! 
* \brief Get the mutator as string with name. * \return The string of the mutator. */ using FAsString = runtime::TypedPackedFunc; - - /*! \brief The packed function to the `InitializeWithTuneContext` function. */ - FInitializeWithTuneContext f_initialize_with_tune_context; - /*! \brief The packed function to the `Apply` function. */ - FApply f_apply; - /*! \brief The packed function to the `AsString` function. */ - FAsString f_as_string; - - void VisitAttrs(tvm::AttrVisitor* v) { - // `f_initialize_with_tune_context` is not visited - // `f_apply` is not visited - // `f_as_string` is not visited - } - - void InitializeWithTuneContext(const TuneContext& context) final; - Optional Apply(const tir::Trace& trace, - support::LinearCongruentialEngine::TRandState* rand_state) final; - - static constexpr const char* _type_key = "meta_schedule.PyMutator"; - TVM_DECLARE_FINAL_OBJECT_INFO(PyMutatorNode, MutatorNode); -}; - -/*! - * \brief Managed reference to MutatorNode - * \sa MutatorNode - */ -class Mutator : public runtime::ObjectRef { - public: /*! \brief Create a Mutator that mutates the decision of instruction Sample-Perfect-Tile */ TVM_DLL static Mutator MutateTileSize(); /*! @@ -136,16 +123,49 @@ class Mutator : public runtime::ObjectRef { * \brief Create a mutator with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. * \param f_apply The packed function of `Apply`. + * \param f_clone The packed function of `Clone`. * \param f_as_string The packed function of `AsString`. * \return The mutator created. 
*/ - TVM_DLL static Mutator PyMutator( - PyMutatorNode::FInitializeWithTuneContext f_initialize_with_tune_context, // - PyMutatorNode::FApply f_apply, // - PyMutatorNode::FAsString f_as_string); + TVM_DLL static Mutator PyMutator(FInitializeWithTuneContext f_initialize_with_tune_context, // + FApply f_apply, // + FClone f_clone, // + FAsString f_as_string); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Mutator, ObjectRef, MutatorNode); }; +/*! \brief The mutator with customized methods on the python-side. */ +class PyMutatorNode : public MutatorNode { + public: + using FInitializeWithTuneContext = Mutator::FInitializeWithTuneContext; + using FApply = Mutator::FApply; + using FClone = Mutator::FClone; + using FAsString = Mutator::FAsString; + /*! \brief The packed function to the `InitializeWithTuneContext` function. */ + FInitializeWithTuneContext f_initialize_with_tune_context; + /*! \brief The packed function to the `Apply` function. */ + FApply f_apply; + /*! \brief The packed function to the `Clone` function. */ + FClone f_clone; + /*! \brief The packed function to the `AsString` function. 
*/ + FAsString f_as_string; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `f_initialize_with_tune_context` is not visited + // `f_apply` is not visited + // `f_clone` is not visited + // `f_as_string` is not visited + } + + void InitializeWithTuneContext(const TuneContext& context) final; + Optional Apply(const tir::Trace& trace, + support::LinearCongruentialEngine::TRandState* rand_state) final; + Mutator Clone() const final; + + static constexpr const char* _type_key = "meta_schedule.PyMutator"; + TVM_DECLARE_FINAL_OBJECT_INFO(PyMutatorNode, MutatorNode); +}; + } // namespace meta_schedule } // namespace tvm diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 5d99f6845463..4fafb9557631 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -29,6 +29,7 @@ namespace tvm { namespace meta_schedule { class TuneContext; +class Postproc; /*! * \brief Rules to apply a postprocessor to a schedule. @@ -54,12 +55,21 @@ class PostprocNode : public runtime::Object { */ virtual bool Apply(const tir::Schedule& sch) = 0; + /*! + * \brief Clone the postprocessor. + * \return The cloned postprocessor. + */ + virtual Postproc Clone() const = 0; + static constexpr const char* _type_key = "meta_schedule.Postproc"; TVM_DECLARE_BASE_OBJECT_INFO(PostprocNode, Object); }; -/*! \brief The postprocessor with customized methods on the python-side. */ -class PyPostprocNode : public PostprocNode { +/*! + * \brief Managed reference to PostprocNode + * \sa PostprocNode + */ +class Postproc : public runtime::ObjectRef { public: /*! * \brief The function type of `InitializeWithTuneContext` method. @@ -72,49 +82,28 @@ class PyPostprocNode : public PostprocNode { * \return Whether the postprocessor was successfully applied. */ using FApply = runtime::TypedPackedFunc; + /*! + * \brief Clone the postprocessor. + * \return The cloned postprocessor. + */ + using FClone = runtime::TypedPackedFunc; /*! 
* \brief Get the postprocessor function as string with name. * \return The string of the postprocessor function. */ using FAsString = runtime::TypedPackedFunc; - - /*! \brief The packed function to the `InitializeWithTuneContext` function. */ - FInitializeWithTuneContext f_initialize_with_tune_context; - /*! \brief The packed function to the `Apply` function. */ - FApply f_apply; - /*! \brief The packed function to the `AsString` function. */ - FAsString f_as_string; - - void VisitAttrs(tvm::AttrVisitor* v) { - // `f_initialize_with_tune_context` is not visited - // `f_apply` is not visited - // `f_as_string` is not visited - } - - void InitializeWithTuneContext(const TuneContext& context) final; - bool Apply(const tir::Schedule& sch) final; - - static constexpr const char* _type_key = "meta_schedule.PyPostproc"; - TVM_DECLARE_FINAL_OBJECT_INFO(PyPostprocNode, PostprocNode); -}; - -/*! - * \brief Managed reference to PostprocNode - * \sa PostprocNode - */ -class Postproc : public runtime::ObjectRef { - public: /*! * \brief Create a postprocessor with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. * \param f_apply The packed function of `Apply`. + * \param f_clone The packed function of `Clone`. * \param f_as_string The packed function of `AsString`. * \return The postprocessor created. */ - TVM_DLL static Postproc PyPostproc( - PyPostprocNode::FInitializeWithTuneContext f_initialize_with_tune_context, // - PyPostprocNode::FApply f_apply, // - PyPostprocNode::FAsString f_as_string); + TVM_DLL static Postproc PyPostproc(FInitializeWithTuneContext f_initialize_with_tune_context, // + FApply f_apply, // + FClone f_clone, // + FAsString f_as_string); /*! 
* \brief Create a postprocessor that checks if all loops are static * \return The postprocessor created @@ -164,6 +153,37 @@ class Postproc : public runtime::ObjectRef { TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Postproc, ObjectRef, PostprocNode); }; +/*! \brief The postprocessor with customized methods on the python-side. */ +class PyPostprocNode : public PostprocNode { + public: + using FInitializeWithTuneContext = Postproc::FInitializeWithTuneContext; + using FApply = Postproc::FApply; + using FClone = Postproc::FClone; + using FAsString = Postproc::FAsString; + /*! \brief The packed function to the `InitializeWithTuneContext` function. */ + FInitializeWithTuneContext f_initialize_with_tune_context; + /*! \brief The packed function to the `Apply` function. */ + FApply f_apply; + /*! \brief The packed function to the `Clone` function. */ + FClone f_clone; + /*! \brief The packed function to the `AsString` function. */ + FAsString f_as_string; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `f_initialize_with_tune_context` is not visited + // `f_apply` is not visited + // `f_clone` is not visited + // `f_as_string` is not visited + } + + void InitializeWithTuneContext(const TuneContext& context) final; + bool Apply(const tir::Schedule& sch) final; + Postproc Clone() const final; + + static constexpr const char* _type_key = "meta_schedule.PyPostproc"; + TVM_DECLARE_FINAL_OBJECT_INFO(PyPostprocNode, PostprocNode); +}; + } // namespace meta_schedule } // namespace tvm diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 2da441c95e0b..55704cf4a97d 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -34,6 +34,7 @@ namespace tvm { namespace meta_schedule { class TuneContext; +class ScheduleRule; /*! \brief Rules to modify a block in a schedule. 
*/ class ScheduleRuleNode : public runtime::Object { @@ -59,12 +60,21 @@ class ScheduleRuleNode : public runtime::Object { virtual runtime::Array Apply(const tir::Schedule& sch, const tir::BlockRV& block) = 0; + /*! + * \brief Deep clone the schedule rule. + * \return The cloned schedule rule. + */ + virtual ScheduleRule Clone() const = 0; + static constexpr const char* _type_key = "meta_schedule.ScheduleRule"; TVM_DECLARE_BASE_OBJECT_INFO(ScheduleRuleNode, Object); }; -/*! \brief The schedule rule with customized methods on the python-side. */ -class PyScheduleRuleNode : public ScheduleRuleNode { +/*! + * \brief Managed reference to ScheduleRuleNode + * \sa ScheduleRuleNode + */ +class ScheduleRule : public runtime::ObjectRef { public: /*! * \brief The function type of `InitializeWithTuneContext` method. @@ -84,33 +94,11 @@ class PyScheduleRuleNode : public ScheduleRuleNode { * \return The string of the schedule rule. */ using FAsString = runtime::TypedPackedFunc; - - /*! \brief The packed function to the `InitializeWithTuneContext` function. */ - FInitializeWithTuneContext f_initialize_with_tune_context; - /*! \brief The packed function to the `Apply` function. */ - FApply f_apply; - /*! \brief The packed function to the `AsString` function. */ - FAsString f_as_string; - - void VisitAttrs(tvm::AttrVisitor* v) { - // `f_initialize_with_tune_context` is not visited - // `f_apply` is not visited - // `f_as_string` is not visited - } - - void InitializeWithTuneContext(const TuneContext& context) final; - Array Apply(const tir::Schedule& sch, const tir::BlockRV& block) final; - - static constexpr const char* _type_key = "meta_schedule.PyScheduleRule"; - TVM_DECLARE_FINAL_OBJECT_INFO(PyScheduleRuleNode, ScheduleRuleNode); -}; - -/*! - * \brief Managed reference to ScheduleRuleNode - * \sa ScheduleRuleNode - */ -class ScheduleRule : public runtime::ObjectRef { - public: + /*! + * \brief The function type of `Clone` method. + * \return The cloned schedule rule. 
+ */ + using FClone = runtime::TypedPackedFunc; /*! * \brief Create an auto-inline rule that inlines spatial blocks if it satisfies some conditions * \param into_producer If allows to inline a block into its producer @@ -249,16 +237,50 @@ class ScheduleRule : public runtime::ObjectRef { * \brief Create a schedule rule with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. * \param f_apply The packed function of `Apply`. + * \param f_clone The packed function of `Clone`. * \param f_as_string The packed function of `AsString`. * \return The schedule rule created. */ TVM_DLL static ScheduleRule PyScheduleRule( - PyScheduleRuleNode::FInitializeWithTuneContext f_initialize_with_tune_context, // - PyScheduleRuleNode::FApply f_apply, // - PyScheduleRuleNode::FAsString f_as_string); + FInitializeWithTuneContext f_initialize_with_tune_context, // + FApply f_apply, // + FClone f_clone, // + FAsString f_as_string); TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(ScheduleRule, ObjectRef, ScheduleRuleNode); }; +/*! \brief The schedule rule with customized methods on the python-side. */ +class PyScheduleRuleNode : public ScheduleRuleNode { + public: + using FInitializeWithTuneContext = ScheduleRule::FInitializeWithTuneContext; + using FApply = ScheduleRule::FApply; + using FClone = ScheduleRule::FClone; + using FAsString = ScheduleRule::FAsString; + + /*! \brief The packed function to the `InitializeWithTuneContext` function. */ + FInitializeWithTuneContext f_initialize_with_tune_context; + /*! \brief The packed function to the `Apply` function. */ + FApply f_apply; + /*! \brief The packed function to the `AsString` function. */ + FAsString f_as_string; + /*! \brief The packed function to the `Clone` function. 
*/ + FClone f_clone; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `f_initialize_with_tune_context` is not visited + // `f_apply` is not visited + // `f_as_string` is not visited + // `f_clone` is not visited + } + + void InitializeWithTuneContext(const TuneContext& context) final; + Array Apply(const tir::Schedule& sch, const tir::BlockRV& block) final; + ScheduleRule Clone() const final; + + static constexpr const char* _type_key = "meta_schedule.PyScheduleRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(PyScheduleRuleNode, ScheduleRuleNode); +}; + } // namespace meta_schedule } // namespace tvm diff --git a/include/tvm/meta_schedule/search_strategy.h b/include/tvm/meta_schedule/search_strategy.h index a75a4cd8ae86..efd3dc24524a 100644 --- a/include/tvm/meta_schedule/search_strategy.h +++ b/include/tvm/meta_schedule/search_strategy.h @@ -36,6 +36,7 @@ namespace meta_schedule { // Forward declaration class TuneContext; +class SearchStrategy; /*! * \brief The search strategy for measure candidates generation. @@ -119,12 +120,21 @@ class SearchStrategyNode : public runtime::Object { virtual void NotifyRunnerResults(const Array& measure_candidates, const Array& results) = 0; + /*! + * \brief Clone the search strategy. + * \return The cloned search strategy. + */ + virtual SearchStrategy Clone() const = 0; + static constexpr const char* _type_key = "meta_schedule.SearchStrategy"; TVM_DECLARE_BASE_OBJECT_INFO(SearchStrategyNode, Object); }; -/*! \brief The python side customizable class for measure candidate generation */ -class PySearchStrategyNode : public SearchStrategyNode { +/*! + * \brief Managed reference to SearchStrategyNode. + * \sa SearchStrategyNode + */ +class SearchStrategy : public runtime::ObjectRef { public: /*! * \brief The function type of `InitializeWithTuneContext` method. @@ -150,44 +160,11 @@ class PySearchStrategyNode : public SearchStrategyNode { */ using FNotifyRunnerResults = runtime::TypedPackedFunc&, const Array&)>; - - /*! 
\brief The packed function to the `InitializeWithTuneContext` method. */ - FInitializeWithTuneContext f_initialize_with_tune_context; - /*! \brief The packed function to the `PreTuning` method. */ - FPreTuning f_pre_tuning; - /*! \brief The packed function to the `PostTuning` method. */ - FPostTuning f_post_tuning; - /*! \brief The packed function to the `GenerateMeasureCandidates` method. */ - FGenerateMeasureCandidates f_generate_measure_candidates; - /*! \brief The packed function to the `NotifyRunnerResults` method. */ - FNotifyRunnerResults f_notify_runner_results; - - void VisitAttrs(tvm::AttrVisitor* v) { - // `f_initialize_with_tune_context` is not visited - // `f_pre_tuning` is not visited - // `f_post_tuning` is not visited - // `f_generate_measure_candidates` is not visited - // `f_notify_runner_results` is not visited - } - - void InitializeWithTuneContext(const TuneContext& context) final; - void PreTuning(const Array& design_spaces, const Optional& database, - const Optional& cost_model) final; - void PostTuning() final; - Optional> GenerateMeasureCandidates() final; - void NotifyRunnerResults(const Array& measure_candidates, - const Array& results); - - static constexpr const char* _type_key = "meta_schedule.PySearchStrategy"; - TVM_DECLARE_FINAL_OBJECT_INFO(PySearchStrategyNode, SearchStrategyNode); -}; - -/*! - * \brief Managed reference to SearchStrategyNode. - * \sa SearchStrategyNode - */ -class SearchStrategy : public runtime::ObjectRef { - public: + /*! + * \brief The function type of `Clone` method. + * \return The cloned search strategy. + */ + using FClone = runtime::TypedPackedFunc; /*! * \brief Create a search strategy with customized methods on the python-side. * \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. @@ -195,14 +172,16 @@ class SearchStrategy : public runtime::ObjectRef { * \param f_post_tuning The packed function of `PostTuning`. 
* \param f_generate_measure_candidates The packed function of `GenerateMeasureCandidates`. * \param f_notify_runner_results The packed function of `NotifyRunnerResults`. + * \param f_clone The packed function of `Clone`. * \return The search strategy created. */ TVM_DLL static SearchStrategy PySearchStrategy( - PySearchStrategyNode::FInitializeWithTuneContext f_initialize_with_tune_context, // - PySearchStrategyNode::FPreTuning f_pre_tuning, // - PySearchStrategyNode::FPostTuning f_post_tuning, // - PySearchStrategyNode::FGenerateMeasureCandidates f_generate_measure_candidates, // - PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results); + FInitializeWithTuneContext f_initialize_with_tune_context, // + FPreTuning f_pre_tuning, // + FPostTuning f_post_tuning, // + FGenerateMeasureCandidates f_generate_measure_candidates, // + FNotifyRunnerResults f_notify_runner_results, // + FClone f_clone); /*! * \brief Constructor of replay trace search strategy. @@ -245,6 +224,51 @@ class SearchStrategy : public runtime::ObjectRef { TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SearchStrategy, ObjectRef, SearchStrategyNode); }; +/*! \brief The python side customizable class for measure candidate generation */ +class PySearchStrategyNode : public SearchStrategyNode { + public: + using FInitializeWithTuneContext = SearchStrategy::FInitializeWithTuneContext; + using FPreTuning = SearchStrategy::FPreTuning; + using FPostTuning = SearchStrategy::FPostTuning; + using FGenerateMeasureCandidates = SearchStrategy::FGenerateMeasureCandidates; + using FNotifyRunnerResults = SearchStrategy::FNotifyRunnerResults; + using FClone = SearchStrategy::FClone; + + /*! \brief The packed function to the `InitializeWithTuneContext` method. */ + FInitializeWithTuneContext f_initialize_with_tune_context; + /*! \brief The packed function to the `PreTuning` method. */ + FPreTuning f_pre_tuning; + /*! \brief The packed function to the `PostTuning` method. */ + FPostTuning f_post_tuning; + /*! 
\brief The packed function to the `GenerateMeasureCandidates` method. */ + FGenerateMeasureCandidates f_generate_measure_candidates; + /*! \brief The packed function to the `NotifyRunnerResults` method. */ + FNotifyRunnerResults f_notify_runner_results; + /*! \brief The packed function to the `Clone` method. */ + FClone f_clone; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `f_initialize_with_tune_context` is not visited + // `f_pre_tuning` is not visited + // `f_post_tuning` is not visited + // `f_generate_measure_candidates` is not visited + // `f_notify_runner_results` is not visited + // `f_clone` is not visited + } + + void InitializeWithTuneContext(const TuneContext& context) final; + void PreTuning(const Array& design_spaces, const Optional& database, + const Optional& cost_model) final; + void PostTuning() final; + Optional> GenerateMeasureCandidates() final; + void NotifyRunnerResults(const Array& measure_candidates, + const Array& results); + SearchStrategy Clone() const final; + + static constexpr const char* _type_key = "meta_schedule.PySearchStrategy"; + TVM_DECLARE_FINAL_OBJECT_INFO(PySearchStrategyNode, SearchStrategyNode); +}; + } // namespace meta_schedule } // namespace tvm diff --git a/include/tvm/meta_schedule/space_generator.h b/include/tvm/meta_schedule/space_generator.h index 2c1b2d4e4d7d..1e29e757a15c 100644 --- a/include/tvm/meta_schedule/space_generator.h +++ b/include/tvm/meta_schedule/space_generator.h @@ -31,6 +31,7 @@ namespace meta_schedule { // Forward declaration class TuneContext; +class SpaceGenerator; /*! * \brief The abstract class for design space generation. @@ -87,12 +88,21 @@ class SpaceGeneratorNode : public runtime::Object { */ virtual Array GenerateDesignSpace(const IRModule& mod) = 0; + /*! + * \brief Clone the space generator. + * \return The cloned space generator. 
+ */ + virtual SpaceGenerator Clone() const = 0; + static constexpr const char* _type_key = "meta_schedule.SpaceGenerator"; TVM_DECLARE_BASE_OBJECT_INFO(SpaceGeneratorNode, Object); }; -/*! \brief The design space generator with customized methods on the python-side. */ -class PySpaceGeneratorNode : public SpaceGeneratorNode { +/*! + * \brief Managed reference to SpaceGeneratorNode. + * \sa SpaceGeneratorNode + */ +class SpaceGenerator : public runtime::ObjectRef { public: /*! * \brief The function type of `InitializeWithTuneContext` method. @@ -105,29 +115,12 @@ class PySpaceGeneratorNode : public SpaceGeneratorNode { * \return The generated design spaces, i.e., schedules. */ using FGenerateDesignSpace = runtime::TypedPackedFunc(const IRModule&)>; + /*! + * \brief The function type of `Clone` method. + * \return The cloned space generator. + */ + using FClone = runtime::TypedPackedFunc; - /*! \brief The packed function to the `InitializeWithTuneContext` function. */ - FInitializeWithTuneContext f_initialize_with_tune_context; - /*! \brief The packed function to the `GenerateDesignSpace` function. */ - FGenerateDesignSpace f_generate_design_space; - - void VisitAttrs(tvm::AttrVisitor* v) { - // `f_initialize_with_tune_context` is not visited - // `f_generate_design_space` is not visited - } - - void InitializeWithTuneContext(const TuneContext& context) final; - Array GenerateDesignSpace(const IRModule& mod) final; - - static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator"; - TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode); -}; - -/*! - * \brief Managed reference to SpaceGeneratorNode. - * \sa SpaceGeneratorNode - */ -class SpaceGenerator : public runtime::ObjectRef { protected: SpaceGenerator() = default; @@ -136,11 +129,12 @@ class SpaceGenerator : public runtime::ObjectRef { * \brief Create a design space generator with customized methods on the python-side. 
* \param f_initialize_with_tune_context The packed function of `InitializeWithTuneContext`. * \param f_generate_design_space The packed function of `GenerateDesignSpace`. + * \param f_clone The packed function of `Clone`. * \return The design space generator created. */ TVM_DLL static SpaceGenerator PySpaceGenerator( - PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context, - PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space); + FInitializeWithTuneContext f_initialize_with_tune_context, + FGenerateDesignSpace f_generate_design_space, FClone f_clone); /*! * \brief Create a design space generator with customized schedule function. * \param schedule_fn The schedule function, which can have the following signatures: @@ -156,14 +150,40 @@ class SpaceGenerator : public runtime::ObjectRef { */ TVM_DLL static SpaceGenerator SpaceGeneratorUnion(Array space_generators); /*! - * \brief Create a design space generator that generates design spaces by applying schedule rules - * to blocks in post-DFS order. - * \return The design space generator created. + * \brief Create a design space generator that generates design spaces by applying schedule + * rules to blocks in post-DFS order. \return The design space generator created. */ TVM_DLL static SpaceGenerator PostOrderApply(runtime::PackedFunc f_block_filter = nullptr); TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(SpaceGenerator, ObjectRef, SpaceGeneratorNode); }; +/*! \brief The design space generator with customized methods on the python-side. */ +class PySpaceGeneratorNode : public SpaceGeneratorNode { + public: + using FInitializeWithTuneContext = SpaceGenerator::FInitializeWithTuneContext; + using FGenerateDesignSpace = SpaceGenerator::FGenerateDesignSpace; + using FClone = SpaceGenerator::FClone; + /*! \brief The packed function to the `InitializeWithTuneContext` function. */ + FInitializeWithTuneContext f_initialize_with_tune_context; + /*! 
\brief The packed function to the `GenerateDesignSpace` function. */ + FGenerateDesignSpace f_generate_design_space; + /*! \brief The packed function to the `Clone` function. */ + FClone f_clone; + + void VisitAttrs(tvm::AttrVisitor* v) { + // `f_initialize_with_tune_context` is not visited + // `f_generate_design_space` is not visited + // `f_clone` is not visited + } + + void InitializeWithTuneContext(const TuneContext& context) final; + Array GenerateDesignSpace(const IRModule& mod) final; + SpaceGenerator Clone() const final; + + static constexpr const char* _type_key = "meta_schedule.PySpaceGenerator"; + TVM_DECLARE_FINAL_OBJECT_INFO(PySpaceGeneratorNode, SpaceGeneratorNode); +}; + } // namespace meta_schedule } // namespace tvm diff --git a/include/tvm/meta_schedule/tune_context.h b/include/tvm/meta_schedule/tune_context.h index 3d732e7fbd99..4e2f00fb5a0c 100644 --- a/include/tvm/meta_schedule/tune_context.h +++ b/include/tvm/meta_schedule/tune_context.h @@ -43,6 +43,7 @@ namespace meta_schedule { class TaskSchedulerNode; class MeasureCallback; +class TuneContext; /*! \brief The auto tuning context. */ class TuneContextNode : public runtime::Object { @@ -99,6 +100,11 @@ class TuneContextNode : public runtime::Object { /*! \brief Initialize members that needs initialization with tune context. */ void Initialize(); + /*! + * \brief Clone the tune context. + * \return The cloned tune context. + */ + TuneContext Clone() const; /*! \brief Set the measure candidates from the SearchStrategy */ void _SetMeasureCandidates(const Array& candidates); /*! 
diff --git a/python/tvm/meta_schedule/mutator/mutator.py b/python/tvm/meta_schedule/mutator/mutator.py index 0c8de9668034..c5286aced7d8 100644 --- a/python/tvm/meta_schedule/mutator/mutator.py +++ b/python/tvm/meta_schedule/mutator/mutator.py @@ -58,6 +58,16 @@ def apply(self, trace: Trace) -> Optional[Trace]: """ return _ffi_api.MutatorApply(self, trace, -1) # type: ignore # pylint: disable=no-member + def clone(self) -> "Mutator": + """Clone the mutator. + + Returns + ------- + mutator : Mutator + The cloned mutator. + """ + return _ffi_api.MutatorClone(self) # type: ignore # pylint: disable=no-member + @register_object("meta_schedule.PyMutator") class _PyMutator(Mutator): @@ -72,6 +82,7 @@ def __init__( self, f_initialize_with_tune_context: Callable = None, f_apply: Callable = None, + f_clone: Callable = None, f_as_string: Callable = None, ): """Constructor.""" @@ -80,6 +91,7 @@ def __init__( _ffi_api.MutatorPyMutator, # type: ignore # pylint: disable=no-member f_initialize_with_tune_context, f_apply, + f_clone, f_as_string, ) @@ -94,7 +106,7 @@ class PyMutator: _tvm_metadata = { "cls": _PyMutator, - "methods": ["_initialize_with_tune_context", "apply", "__str__"], + "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"], } def _initialize_with_tune_context(self, context: "TuneContext") -> None: @@ -122,6 +134,16 @@ def apply(self, trace: Trace, _) -> Optional[Trace]: """ raise NotImplementedError + def clone(self) -> Mutator: + """Clone the mutator. + + Returns + ------- + mutator : Mutator + The cloned mutator. + """ + raise NotImplementedError + def __str__(self) -> str: """Get the mutator as string with name. 
diff --git a/python/tvm/meta_schedule/postproc/postproc.py b/python/tvm/meta_schedule/postproc/postproc.py index e37666bd1ce0..6eec2965ceeb 100644 --- a/python/tvm/meta_schedule/postproc/postproc.py +++ b/python/tvm/meta_schedule/postproc/postproc.py @@ -60,6 +60,16 @@ def apply(self, sch: Schedule) -> bool: """ return _ffi_api.PostprocApply(self, sch) # type: ignore # pylint: disable=no-member + def clone(self) -> "Postproc": + """Clone the postprocessor. + + Returns + ------- + cloned_postproc : Postproc + The cloned postprocessor. + """ + return _ffi_api.PostprocClone(self) # type: ignore # pylint: disable=no-member + @register_object("meta_schedule.PyPostproc") class _PyPostproc(Postproc): @@ -74,6 +84,7 @@ def __init__( self, f_initialize_with_tune_context: Callable = None, f_apply: Callable = None, + f_clone: Callable = None, f_as_string: Callable = None, ): """Constructor.""" @@ -82,6 +93,7 @@ def __init__( _ffi_api.PostprocPyPostproc, # type: ignore # pylint: disable=no-member f_initialize_with_tune_context, f_apply, + f_clone, f_as_string, ) @@ -96,7 +108,7 @@ class PyPostproc: _tvm_metadata = { "cls": _PyPostproc, - "methods": ["_initialize_with_tune_context", "apply", "__str__"], + "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"], } def _initialize_with_tune_context(self, context: "TuneContext") -> None: @@ -124,6 +136,16 @@ def apply(self, sch: Schedule) -> bool: """ raise NotImplementedError + def clone(self) -> Postproc: + """Clone the postprocessor. + + Returns + ------- + cloned_postproc : Postproc + The cloned postprocessor. + """ + raise NotImplementedError + def __str__(self) -> str: """Get the post processor as string with name. 
diff --git a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py index 481444341b86..2c8e223611aa 100644 --- a/python/tvm/meta_schedule/schedule_rule/schedule_rule.py +++ b/python/tvm/meta_schedule/schedule_rule/schedule_rule.py @@ -66,6 +66,16 @@ def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]: self, sch, block ) + def clone(self) -> "ScheduleRule": + """Deep clone the schedule rule. + + Returns + ------- + cloned_rule : ScheduleRule + The cloned schedule rule. + """ + return _ffi_api.ScheduleRuleClone(self) # type: ignore # pylint: disable=no-member + @register_object("meta_schedule.PyScheduleRule") class _PyScheduleRule(ScheduleRule): @@ -80,6 +90,7 @@ def __init__( self, f_initialize_with_tune_context: Callable = None, f_apply: Callable = None, + f_clone: Callable = None, f_as_string: Callable = None, ): """Constructor.""" @@ -88,6 +99,7 @@ def __init__( _ffi_api.ScheduleRulePyScheduleRule, # type: ignore # pylint: disable=no-member f_initialize_with_tune_context, f_apply, + f_clone, f_as_string, ) @@ -102,7 +114,7 @@ class PyScheduleRule: _tvm_metadata = { "cls": _PyScheduleRule, - "methods": ["_initialize_with_tune_context", "apply", "__str__"], + "methods": ["_initialize_with_tune_context", "apply", "clone", "__str__"], } def _initialize_with_tune_context(self, context: "TuneContext") -> None: @@ -113,9 +125,7 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None: context : TuneContext The tuning context for initializing the schedule rule. """ - _ffi_api.ScheduleRuleInitializeWithTuneContext( # type: ignore # pylint: disable=no-member - self, context - ) + raise NotImplementedError def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]: """Apply a schedule rule to the specific block in the given schedule. 
@@ -132,9 +142,17 @@ def apply(self, sch: Schedule, block: BlockRV) -> List[Schedule]: design_spaces : List[Schedule] The list of schedules generated by applying the schedule rule. """ - return _ffi_api.ScheduleRuleApply( # type: ignore # pylint: disable=no-member - self, sch, block - ) + raise NotImplementedError + + def clone(self) -> ScheduleRule: + """Deep clone the schedule rule. + + Returns + ------- + cloned_rule : ScheduleRule + The cloned schedule rule. + """ + raise NotImplementedError def __str__(self) -> str: """Get the schedule rule as string with name. diff --git a/python/tvm/meta_schedule/search_strategy/search_strategy.py b/python/tvm/meta_schedule/search_strategy/search_strategy.py index e88cdf825a79..276e65713325 100644 --- a/python/tvm/meta_schedule/search_strategy/search_strategy.py +++ b/python/tvm/meta_schedule/search_strategy/search_strategy.py @@ -151,6 +151,16 @@ def notify_runner_results( results, ) + def clone(self) -> "SearchStrategy": + """Clone the search strategy. + + Returns + ------- + cloned : SearchStrategy + The cloned search strategy. + """ + return _ffi_api.SearchStrategyClone(self) # type: ignore # pylint: disable=no-member + @register_object("meta_schedule.PySearchStrategy") class _PySearchStrategy(SearchStrategy): @@ -168,6 +178,7 @@ def __init__( f_post_tuning: Callable = None, f_generate_measure_candidates: Callable = None, f_notify_runner_results: Callable = None, + f_clone: Callable = None, ): """Constructor.""" @@ -178,6 +189,7 @@ def __init__( f_post_tuning, f_generate_measure_candidates, f_notify_runner_results, + f_clone, ) @@ -197,6 +209,7 @@ class PySearchStrategy: "post_tuning", "generate_measure_candidates", "notify_runner_results", + "clone", ], } @@ -250,6 +263,16 @@ def notify_runner_results( """ raise NotImplementedError + def clone(self) -> SearchStrategy: + """Clone the search strategy. + + Returns + ------- + strategy : SearchStrategy + The cloned search strategy. 
+ """ + raise NotImplementedError + def create( # pylint: disable=keyword-arg-before-vararg kind: Literal[ diff --git a/python/tvm/meta_schedule/space_generator/space_generator.py b/python/tvm/meta_schedule/space_generator/space_generator.py index 9d7ebf3bae26..23c0361645b5 100644 --- a/python/tvm/meta_schedule/space_generator/space_generator.py +++ b/python/tvm/meta_schedule/space_generator/space_generator.py @@ -72,6 +72,16 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]: """ return _ffi_api.SpaceGeneratorGenerateDesignSpace(self, mod) # type: ignore # pylint: disable=no-member + def clone(self) -> "SpaceGenerator": + """Clone the design space generator. + + Returns + ------- + cloned_sg : SpaceGenerator + The cloned design space generator. + """ + return _ffi_api.SpaceGeneratorClone(self) # type: ignore # pylint: disable=no-member + ScheduleFnType = SpaceGenerator.ScheduleFnType @@ -89,6 +99,7 @@ def __init__( self, f_initialize_with_tune_context: Optional[Callable] = None, f_generate_design_space: Optional[Callable] = None, + f_clone: Optional[Callable] = None, ): """Constructor.""" @@ -96,6 +107,7 @@ def __init__( _ffi_api.SpaceGeneratorPySpaceGenerator, # type: ignore # pylint: disable=no-member f_initialize_with_tune_context, f_generate_design_space, + f_clone, ) @@ -109,7 +121,7 @@ class PySpaceGenerator: _tvm_metadata = { "cls": _PySpaceGenerator, - "methods": ["_initialize_with_tune_context", "generate_design_space"], + "methods": ["_initialize_with_tune_context", "generate_design_space", "clone"], } def _initialize_with_tune_context(self, context: "TuneContext") -> None: @@ -137,6 +149,16 @@ def generate_design_space(self, mod: IRModule) -> List[Schedule]: """ raise NotImplementedError + def clone(self) -> SpaceGenerator: + """Clone the design space generator. + + Returns + ------- + cloned_sg : SpaceGenerator + The cloned design space generator. 
+ """ + raise NotImplementedError + def create( # pylint: disable=keyword-arg-before-vararg kind: Union[ diff --git a/python/tvm/meta_schedule/testing/dummy_object.py b/python/tvm/meta_schedule/testing/dummy_object.py index 50ae974df5d8..bb2294544920 100644 --- a/python/tvm/meta_schedule/testing/dummy_object.py +++ b/python/tvm/meta_schedule/testing/dummy_object.py @@ -58,3 +58,6 @@ def _initialize_with_tune_context(self, context: "TuneContext") -> None: def apply(self, trace: Trace, _) -> Optional[Trace]: return Trace(trace.insts, {}) + + def clone(self): + return DummyMutator() diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py index 17acad8d4a57..29cd94110c0c 100644 --- a/python/tvm/meta_schedule/tune_context.py +++ b/python/tvm/meta_schedule/tune_context.py @@ -331,3 +331,13 @@ def notify_runner_results( "Please construct TuneContext with search_strategy" ) return self.search_strategy.notify_runner_results(measure_candidates, results) + + def clone(self) -> "TuneContext": + """Clone the TuneContext. + + Returns + ------- + cloned_context : TuneContext + The cloned TuneContext. 
+ """ + return _ffi_api.TuneContextClone(self) # type: ignore # pylint: disable=no-member diff --git a/src/meta_schedule/mutator/mutate_compute_location.cc b/src/meta_schedule/mutator/mutate_compute_location.cc index 9d6d69ba355f..2a31d2da9b53 100644 --- a/src/meta_schedule/mutator/mutate_compute_location.cc +++ b/src/meta_schedule/mutator/mutate_compute_location.cc @@ -42,6 +42,11 @@ class MutateComputeLocationNode : public MutatorNode { } // Inherit from `MutatorNode` Optional Apply(const Trace& trace, TRandState* rand_state) final; + // Inherit from `MutatorNode` + Mutator Clone() const final { + ObjectPtr n = make_object(*this); + return Mutator(n); + } private: struct Candidate { diff --git a/src/meta_schedule/mutator/mutate_parallel.cc b/src/meta_schedule/mutator/mutate_parallel.cc index 82b91da682c6..9feb4747d807 100644 --- a/src/meta_schedule/mutator/mutate_parallel.cc +++ b/src/meta_schedule/mutator/mutate_parallel.cc @@ -188,6 +188,11 @@ class MutateParallelNode : public MutatorNode { } // Inherit from `MutatorNode` Optional Apply(const Trace& trace, TRandState* rand_state) final; + // Inherit from `MutatorNode` + Mutator Clone() const final { + ObjectPtr n = make_object(*this); + return Mutator(n); + } }; /*! 
\brief The candidate to be mutated */ diff --git a/src/meta_schedule/mutator/mutate_thread_binding.cc b/src/meta_schedule/mutator/mutate_thread_binding.cc index de780b53e2d9..f5d89a85092b 100644 --- a/src/meta_schedule/mutator/mutate_thread_binding.cc +++ b/src/meta_schedule/mutator/mutate_thread_binding.cc @@ -42,6 +42,11 @@ class MutateThreadBindingNode : public MutatorNode { } // Inherit from `MutatorNode` Optional Apply(const Trace& trace, TRandState* rand_state) final; + // Inherit from `MutatorNode` + Mutator Clone() const final { + ObjectPtr n = make_object(*this); + return Mutator(n); + } private: struct Candidate { diff --git a/src/meta_schedule/mutator/mutate_tile_size.cc b/src/meta_schedule/mutator/mutate_tile_size.cc index 4a3bfda8a4a8..8fb83147ea7b 100644 --- a/src/meta_schedule/mutator/mutate_tile_size.cc +++ b/src/meta_schedule/mutator/mutate_tile_size.cc @@ -63,6 +63,11 @@ class MutateTileSizeNode : public MutatorNode { void InitializeWithTuneContext(const TuneContext& context) final {} // Inherit from `MutatorNode` Optional Apply(const Trace& trace, TRandState* rand_state) final; + // Inherit from `MutatorNode` + Mutator Clone() const final { + ObjectPtr n = make_object(*this); + return Mutator(n); + } }; /*! diff --git a/src/meta_schedule/mutator/mutate_unroll.cc b/src/meta_schedule/mutator/mutate_unroll.cc index c282a171c3b7..7bbf00343af3 100644 --- a/src/meta_schedule/mutator/mutate_unroll.cc +++ b/src/meta_schedule/mutator/mutate_unroll.cc @@ -60,6 +60,11 @@ class MutateUnrollNode : public MutatorNode { void InitializeWithTuneContext(const TuneContext& context) final {} // Inherit from `MutatorNode` Optional Apply(const Trace& trace, TRandState* rand_state) final; + // Inherit from `MutatorNode` + Mutator Clone() const final { + ObjectPtr n = make_object(*this); + return Mutator(n); + } }; /*! 
\brief A candidate to be mutated */ diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc index 43b95000c71d..25312ab61f99 100644 --- a/src/meta_schedule/mutator/mutator.cc +++ b/src/meta_schedule/mutator/mutator.cc @@ -33,13 +33,20 @@ Optional PyMutatorNode::Apply( return f_apply(trace, *rand_state); } +Mutator PyMutatorNode::Clone() const { + ICHECK(f_clone != nullptr) << "PyMutator's Clone method not implemented!"; + return f_clone(); +} + Mutator Mutator::PyMutator( PyMutatorNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyMutatorNode::FApply f_apply, // + PyMutatorNode::FClone f_clone, // PyMutatorNode::FAsString f_as_string) { ObjectPtr n = make_object(); n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context); n->f_apply = std::move(f_apply); + n->f_clone = std::move(f_clone); n->f_as_string = std::move(f_as_string); return Mutator(n); } @@ -63,6 +70,7 @@ TVM_REGISTER_GLOBAL("meta_schedule.MutatorApply") TRandState seed_ = (seed != -1) ? 
seed : support::LinearCongruentialEngine::DeviceRandom(); return self->Apply(trace, &seed_); }); +TVM_REGISTER_GLOBAL("meta_schedule.MutatorClone").set_body_method(&MutatorNode::Clone); TVM_REGISTER_GLOBAL("meta_schedule.MutatorPyMutator").set_body_typed(Mutator::PyMutator); } // namespace meta_schedule diff --git a/src/meta_schedule/postproc/disallow_dynamic_loop.cc b/src/meta_schedule/postproc/disallow_dynamic_loop.cc index 85a81f10fdcd..8362da552ea5 100644 --- a/src/meta_schedule/postproc/disallow_dynamic_loop.cc +++ b/src/meta_schedule/postproc/disallow_dynamic_loop.cc @@ -67,6 +67,11 @@ class DisallowDynamicLoopNode : public PostprocNode { void InitializeWithTuneContext(const TuneContext& context) final {} // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final { return !tir::DynamicExtentFinder::Find(sch->mod()); } + // Inherited from PostprocNode + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } static constexpr const char* _type_key = "meta_schedule.DisallowDynamicLoop"; TVM_DECLARE_FINAL_OBJECT_INFO(DisallowDynamicLoopNode, PostprocNode); diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc index 0f4f1b1192f6..957d6e7364e4 100644 --- a/src/meta_schedule/postproc/postproc.cc +++ b/src/meta_schedule/postproc/postproc.cc @@ -32,13 +32,20 @@ bool PyPostprocNode::Apply(const tir::Schedule& sch) { return f_apply(sch); } +Postproc PyPostprocNode::Clone() const { + ICHECK(f_clone != nullptr) << "PyPostproc's Clone method not implemented!"; + return f_clone(); +} + Postproc Postproc::PyPostproc( PyPostprocNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyPostprocNode::FApply f_apply, // + PyPostprocNode::FClone f_clone, // PyPostprocNode::FAsString f_as_string) { ObjectPtr n = make_object(); n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context); n->f_apply = std::move(f_apply); + n->f_clone = std::move(f_clone); n->f_as_string 
= std::move(f_as_string); return Postproc(n); } @@ -58,6 +65,7 @@ TVM_REGISTER_NODE_TYPE(PyPostprocNode); TVM_REGISTER_GLOBAL("meta_schedule.PostprocInitializeWithTuneContext") .set_body_method(&PostprocNode::InitializeWithTuneContext); TVM_REGISTER_GLOBAL("meta_schedule.PostprocApply").set_body_method(&PostprocNode::Apply); +TVM_REGISTER_GLOBAL("meta_schedule.PostprocClone").set_body_method(&PostprocNode::Clone); TVM_REGISTER_GLOBAL("meta_schedule.PostprocPyPostproc").set_body_typed(Postproc::PyPostproc); } // namespace meta_schedule diff --git a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc index d111bdb42abb..ac9f45ca8ef4 100644 --- a/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc +++ b/src/meta_schedule/postproc/rewrite_cooperative_fetch.cc @@ -104,6 +104,11 @@ class RewriteCooperativeFetchNode : public PostprocNode { // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final; + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } + void VisitAttrs(tvm::AttrVisitor* v) {} static constexpr const char* _type_key = "meta_schedule.RewriteCooperativeFetch"; diff --git a/src/meta_schedule/postproc/rewrite_layout.cc b/src/meta_schedule/postproc/rewrite_layout.cc index f4cbdfe737fb..6ff9958c791f 100644 --- a/src/meta_schedule/postproc/rewrite_layout.cc +++ b/src/meta_schedule/postproc/rewrite_layout.cc @@ -167,6 +167,11 @@ class RewriteLayoutNode : public PostprocNode { // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final { return tir::RewriteLayout(sch); } + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } + static constexpr const char* _type_key = "meta_schedule.RewriteLayout"; TVM_DECLARE_FINAL_OBJECT_INFO(RewriteLayoutNode, PostprocNode); }; diff --git a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc 
index 08d25d017840..c3cc0ef60152 100644 --- a/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc +++ b/src/meta_schedule/postproc/rewrite_parallel_vectorize_unroll.cc @@ -384,6 +384,12 @@ class RewriteParallelVectorizeUnrollNode : public PostprocNode { return true; } + Postproc Clone() const { + ObjectPtr n = + make_object(*this); + return Postproc(n); + } + static constexpr const char* _type_key = "meta_schedule.RewriteParallelVectorizeUnroll"; TVM_DECLARE_FINAL_OBJECT_INFO(RewriteParallelVectorizeUnrollNode, PostprocNode); }; diff --git a/src/meta_schedule/postproc/rewrite_reduction_block.cc b/src/meta_schedule/postproc/rewrite_reduction_block.cc index ea204e306133..05a7640f047c 100644 --- a/src/meta_schedule/postproc/rewrite_reduction_block.cc +++ b/src/meta_schedule/postproc/rewrite_reduction_block.cc @@ -114,6 +114,11 @@ class RewriteReductionBlockNode : public PostprocNode { // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final; + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } + void VisitAttrs(tvm::AttrVisitor* v) {} static constexpr const char* _type_key = "meta_schedule.RewriteReductionBlock"; diff --git a/src/meta_schedule/postproc/rewrite_tensorize.cc b/src/meta_schedule/postproc/rewrite_tensorize.cc index 3b6c438d0216..4f8e0fb213f8 100644 --- a/src/meta_schedule/postproc/rewrite_tensorize.cc +++ b/src/meta_schedule/postproc/rewrite_tensorize.cc @@ -68,6 +68,11 @@ class RewriteTensorizeNode : public PostprocNode { void VisitAttrs(tvm::AttrVisitor* v) {} + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } + bool vectorize_init_loop = false; static constexpr const char* _type_key = "meta_schedule.RewriteTensorize"; diff --git a/src/meta_schedule/postproc/rewrite_unbound_block.cc b/src/meta_schedule/postproc/rewrite_unbound_block.cc index eb57e90f82f6..1ba68538ea04 100644 --- a/src/meta_schedule/postproc/rewrite_unbound_block.cc +++ 
b/src/meta_schedule/postproc/rewrite_unbound_block.cc @@ -97,6 +97,11 @@ class RewriteUnboundBlockNode : public PostprocNode { // Inherited from PostprocNode bool Apply(const tir::Schedule& sch) final; + Postproc Clone() const { + ObjectPtr n = make_object(*this); + return Postproc(n); + } + public: /*! \brief The max number of threads per block from Target */ int max_threads_per_block_ = -1; diff --git a/src/meta_schedule/postproc/verify_gpu_code.cc b/src/meta_schedule/postproc/verify_gpu_code.cc index dfe2c5a06a17..0828ee538427 100644 --- a/src/meta_schedule/postproc/verify_gpu_code.cc +++ b/src/meta_schedule/postproc/verify_gpu_code.cc @@ -196,6 +196,12 @@ class VerifyGPUCodeNode : public PostprocNode { return true; } + Postproc Clone() const { + ObjectPtr n = make_object(*this); + n->target_constraints_ = this->target_constraints_; + return Postproc(n); + } + static constexpr const char* _type_key = "meta_schedule.VerifyGPUCode"; TVM_DECLARE_FINAL_OBJECT_INFO(VerifyGPUCodeNode, PostprocNode); }; diff --git a/src/meta_schedule/schedule_rule/add_rfactor.cc b/src/meta_schedule/schedule_rule/add_rfactor.cc index cf87f24ac233..2fc1352677cb 100644 --- a/src/meta_schedule/schedule_rule/add_rfactor.cc +++ b/src/meta_schedule/schedule_rule/add_rfactor.cc @@ -36,6 +36,12 @@ class AddRFactorNode : public ScheduleRuleNode { // Inherited from ScheduleRuleNode Array Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv); + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); + } + public: /*! * \brief The maximum number of jobs to be launched per core. 
diff --git a/src/meta_schedule/schedule_rule/auto_bind.cc b/src/meta_schedule/schedule_rule/auto_bind.cc index d8f52fa8e1de..7af1418d8f3e 100644 --- a/src/meta_schedule/schedule_rule/auto_bind.cc +++ b/src/meta_schedule/schedule_rule/auto_bind.cc @@ -177,6 +177,12 @@ class AutoBindNode : public ScheduleRuleNode { // Inherited from ScheduleRuleNode Array Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final; + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); + } + public: /*! \brief The max number of threads per block from Target */ int64_t max_threads_per_block_ = -1; diff --git a/src/meta_schedule/schedule_rule/auto_inline.cc b/src/meta_schedule/schedule_rule/auto_inline.cc index 446c8ead7e8e..dcdc83f95cb1 100644 --- a/src/meta_schedule/schedule_rule/auto_inline.cc +++ b/src/meta_schedule/schedule_rule/auto_inline.cc @@ -60,6 +60,12 @@ class AutoInlineNode : public ScheduleRuleNode { return {sch}; } + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); + } + public: /*! \brief If allows to inline a block into its producer */ bool into_producer; diff --git a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc index 35be33f72e21..f2fc67f74cc7 100644 --- a/src/meta_schedule/schedule_rule/cross_thread_reduction.cc +++ b/src/meta_schedule/schedule_rule/cross_thread_reduction.cc @@ -113,6 +113,12 @@ class CrossThreadReductionNode : public ScheduleRuleNode { return {tmp_sch, sch}; } + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); + } + private: /*! 
* \brief Check whether the input block is in thread scope, i.e., some of its outer loop is diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc index c126c854462c..1625a27b9aaf 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc @@ -104,6 +104,12 @@ Array MultiLevelTilingNode::Apply(const Schedule& sch, const BlockRV& return results; } +// Inherited from ScheduleRuleNode +ScheduleRule MultiLevelTilingNode::Clone() const { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); +} + std::vector MultiLevelTilingNode::ApplySubRules(std::vector states) { states = SubRule(std::move(states), [&](State state) { return TileLoopNest(std::move(state)); }); states = SubRule(std::move(states), [&](State state) { return AddWriteReuse(std::move(state)); }); diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h index 9161a972c187..47da878c3be0 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.h +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h @@ -155,6 +155,9 @@ class MultiLevelTilingNode : public ScheduleRuleNode { // Entry of the mega rule; Inherited from ScheduleRuleNode Array Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) override; + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const override; + protected: virtual std::vector ApplySubRules(std::vector states); diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 7ddda9b2635b..13b00fa7deb6 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -137,6 +137,13 @@ class MultiLevelTilingTensorCoreNode : public MultiLevelTilingNode { // Override Apply to apply 
tensorization-specific analysis before applying sub-rules Array Apply(const Schedule& sch, const BlockRV& block_rv) final; + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = + make_object(*this); + return ScheduleRule(n); + } + /*! * \brief Transform and tensorize with the given tensor intrin * \param state The state of the meta schedule rule diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc index 3a299ed041e2..b953d1ad4b50 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc @@ -63,6 +63,13 @@ class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode { return res; } + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = + make_object(*this); + return ScheduleRule(n); + } + // Override ApplySubRules to tile the inner loops according to the given tensor intrinsic, then // tile the outerloops. virtual std::vector ApplySubRules(std::vector states) { diff --git a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc index 19758996e608..045aa85b73ad 100644 --- a/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc +++ b/src/meta_schedule/schedule_rule/parallel_vectorize_unroll.cc @@ -79,6 +79,13 @@ class ParallelizeVectorizeUnrollNode : public ScheduleRuleNode { return {sch}; } + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = + make_object(*this); + return ScheduleRule(n); + } + public: /*! * \brief The maximum number of jobs to be launched per CPU core. 
It sets the diff --git a/src/meta_schedule/schedule_rule/random_compute_location.cc b/src/meta_schedule/schedule_rule/random_compute_location.cc index 65988dfd5688..7796eddd44d3 100644 --- a/src/meta_schedule/schedule_rule/random_compute_location.cc +++ b/src/meta_schedule/schedule_rule/random_compute_location.cc @@ -57,6 +57,12 @@ class RandomComputeLocationNode : public ScheduleRuleNode { return {res}; } + // Inherited from ScheduleRuleNode + ScheduleRule Clone() const final { + ObjectPtr n = make_object(*this); + return ScheduleRule(n); + } + private: bool CheckConditions(const tir::Schedule sch, const tir::BlockRV& block_rv) const { tir::StmtSRef block_sref = sch->GetSRef(block_rv); diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 80f8725b0c0d..416b43f46d56 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -33,13 +33,20 @@ Array PyScheduleRuleNode::Apply(const tir::Schedule& sch, return f_apply(sch, block); } +ScheduleRule PyScheduleRuleNode::Clone() const { + ICHECK(f_clone != nullptr) << "PyScheduleRule's Clone method not implemented!"; + return f_clone(); +} + ScheduleRule ScheduleRule::PyScheduleRule( PyScheduleRuleNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PyScheduleRuleNode::FApply f_apply, // + PyScheduleRuleNode::FClone f_clone, // PyScheduleRuleNode::FAsString f_as_string) { ObjectPtr n = make_object(); n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context); n->f_apply = std::move(f_apply); + n->f_clone = std::move(f_clone); n->f_as_string = std::move(f_as_string); return ScheduleRule(n); } @@ -60,6 +67,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleInitializeWithTuneContext") .set_body_method(&ScheduleRuleNode::InitializeWithTuneContext); TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleApply") .set_body_method(&ScheduleRuleNode::Apply); 
+TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleClone") + .set_body_method(&ScheduleRuleNode::Clone); TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRulePyScheduleRule") .set_body_typed(ScheduleRule::PyScheduleRule); diff --git a/src/meta_schedule/search_strategy/evolutionary_search.cc b/src/meta_schedule/search_strategy/evolutionary_search.cc index c5ff9008effe..5930704eb0d1 100644 --- a/src/meta_schedule/search_strategy/evolutionary_search.cc +++ b/src/meta_schedule/search_strategy/evolutionary_search.cc @@ -431,6 +431,24 @@ class EvolutionarySearchNode : public SearchStrategyNode { ICHECK(this->state_ != nullptr); this->state_->NotifyRunnerResults(measure_candidates, results); } + + SearchStrategy Clone() const final { + ObjectPtr n = make_object(); + n->max_trials_per_task = this->max_trials_per_task; + n->num_trials_per_iter = this->num_trials_per_iter; + n->population_size = this->population_size; + n->num_empty_iters_before_early_stop = this->num_empty_iters_before_early_stop; + n->init_measured_ratio = this->init_measured_ratio; + n->init_min_unmeasured = this->init_min_unmeasured; + n->genetic_num_iters = this->genetic_num_iters; + n->genetic_mutate_prob = this->genetic_mutate_prob; + n->genetic_max_fail_count = this->genetic_max_fail_count; + n->eps_greedy = this->eps_greedy; + n->context_ = this->context_; + n->rand_state_ = this->rand_state_; + n->state_ = nullptr; // cleared the state + return SearchStrategy(n); + } }; std::vector EvolutionarySearchNode::State::PickBestFromDatabase(int num) { diff --git a/src/meta_schedule/search_strategy/replay_func.cc b/src/meta_schedule/search_strategy/replay_func.cc index 4574c1c817a8..6914ab2f0f0a 100644 --- a/src/meta_schedule/search_strategy/replay_func.cc +++ b/src/meta_schedule/search_strategy/replay_func.cc @@ -100,6 +100,16 @@ class ReplayFuncNode : public SearchStrategyNode { ICHECK(this->state_ != nullptr); this->state_->NotifyRunnerResults(results); } + + SearchStrategy Clone() const final { + ObjectPtr n = 
make_object(); + n->num_trials_per_iter = this->num_trials_per_iter; + n->max_trials_per_task = this->max_trials_per_task; + n->context_ = this->context_; + n->rand_state_ = this->rand_state_; + n->state_ = nullptr; // cleared the state + return SearchStrategy(n); + } }; inline Optional> ReplayFuncNode::State::GenerateMeasureCandidates() { diff --git a/src/meta_schedule/search_strategy/replay_trace.cc b/src/meta_schedule/search_strategy/replay_trace.cc index 64fc68394357..bd553bf037d1 100644 --- a/src/meta_schedule/search_strategy/replay_trace.cc +++ b/src/meta_schedule/search_strategy/replay_trace.cc @@ -118,6 +118,17 @@ class ReplayTraceNode : public SearchStrategyNode { ICHECK(this->state_ != nullptr); this->state_->NotifyRunnerResults(results); } + + SearchStrategy Clone() const final { + ObjectPtr n = make_object(); + n->num_trials_per_iter = this->num_trials_per_iter; + n->max_trials_per_task = this->max_trials_per_task; + n->max_fail_count = this->max_fail_count; + n->context_ = this->context_; + n->rand_state_ = this->rand_state_; + n->state_ = nullptr; // cleared the state + return SearchStrategy(n); + } }; inline Optional> ReplayTraceNode::State::GenerateMeasureCandidates() { diff --git a/src/meta_schedule/search_strategy/search_strategy.cc b/src/meta_schedule/search_strategy/search_strategy.cc index 5865fc842248..81c7fda315b4 100644 --- a/src/meta_schedule/search_strategy/search_strategy.cc +++ b/src/meta_schedule/search_strategy/search_strategy.cc @@ -59,18 +59,25 @@ void PySearchStrategyNode::NotifyRunnerResults(const Array& me f_notify_runner_results(measure_candidates, results); } +SearchStrategy PySearchStrategyNode::Clone() const { + ICHECK(f_clone != nullptr) << "PySearchStrategy's Clone method not implemented!"; + return f_clone(); +} + SearchStrategy SearchStrategy::PySearchStrategy( PySearchStrategyNode::FInitializeWithTuneContext f_initialize_with_tune_context, // PySearchStrategyNode::FPreTuning f_pre_tuning, // 
PySearchStrategyNode::FPostTuning f_post_tuning, // PySearchStrategyNode::FGenerateMeasureCandidates f_generate_measure_candidates, // - PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results) { + PySearchStrategyNode::FNotifyRunnerResults f_notify_runner_results, // + PySearchStrategyNode::FClone f_clone) { ObjectPtr n = make_object(); n->f_initialize_with_tune_context = f_initialize_with_tune_context; n->f_pre_tuning = f_pre_tuning; n->f_post_tuning = f_post_tuning; n->f_generate_measure_candidates = f_generate_measure_candidates; n->f_notify_runner_results = f_notify_runner_results; + n->f_clone = f_clone; return SearchStrategy(n); } @@ -94,6 +101,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyGenerateMeasureCandidates") .set_body_method(&SearchStrategyNode::GenerateMeasureCandidates); TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyNotifyRunnerResults") .set_body_method(&SearchStrategyNode::NotifyRunnerResults); +TVM_REGISTER_GLOBAL("meta_schedule.SearchStrategyClone") + .set_body_method(&SearchStrategyNode::Clone); } // namespace meta_schedule } // namespace tvm diff --git a/src/meta_schedule/space_generator/post_order_apply.cc b/src/meta_schedule/space_generator/post_order_apply.cc index 9be89e2d9c70..991e4fa08047 100644 --- a/src/meta_schedule/space_generator/post_order_apply.cc +++ b/src/meta_schedule/space_generator/post_order_apply.cc @@ -188,6 +188,15 @@ class PostOrderApplyNode : public SpaceGeneratorNode { } return result; } + + SpaceGenerator Clone() const final { + ObjectPtr n = make_object(*this); + n->sch_rules_ = Array(); + for (const ScheduleRule& sch_rule : this->sch_rules_) { + n->sch_rules_.push_back(sch_rule->Clone()); + } + return SpaceGenerator(n); + } static constexpr const char* _type_key = "meta_schedule.PostOrderApply"; TVM_DECLARE_FINAL_OBJECT_INFO(PostOrderApplyNode, SpaceGeneratorNode); }; diff --git a/src/meta_schedule/space_generator/schedule_fn.cc b/src/meta_schedule/space_generator/schedule_fn.cc index 
70559fbcf1fb..adea139b1cd4 100644 --- a/src/meta_schedule/space_generator/schedule_fn.cc +++ b/src/meta_schedule/space_generator/schedule_fn.cc @@ -72,6 +72,11 @@ class ScheduleFnNode : public SpaceGeneratorNode { throw; } + SpaceGenerator Clone() const final { + ObjectPtr n = make_object(*this); + return SpaceGenerator(n); + } + static constexpr const char* _type_key = "meta_schedule.ScheduleFn"; TVM_DECLARE_FINAL_OBJECT_INFO(ScheduleFnNode, SpaceGeneratorNode); }; diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 5c5ab6ebbae5..6fc31ed896f2 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -33,12 +33,18 @@ Array PySpaceGeneratorNode::GenerateDesignSpace(const IRModule& m return f_generate_design_space(mod); } +SpaceGenerator PySpaceGeneratorNode::Clone() const { + ICHECK(f_clone != nullptr) << "PySpaceGenerator's Clone method not implemented!"; + return f_clone(); +} + SpaceGenerator SpaceGenerator::PySpaceGenerator( - PySpaceGeneratorNode::FInitializeWithTuneContext f_initialize_with_tune_context, - PySpaceGeneratorNode::FGenerateDesignSpace f_generate_design_space) { + FInitializeWithTuneContext f_initialize_with_tune_context, + FGenerateDesignSpace f_generate_design_space, FClone f_clone) { ObjectPtr n = make_object(); n->f_initialize_with_tune_context = std::move(f_initialize_with_tune_context); n->f_generate_design_space = std::move(f_generate_design_space); + n->f_clone = std::move(f_clone); return SpaceGenerator(n); } @@ -51,6 +57,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorGenerateDesignSpace") .set_body_method(&SpaceGeneratorNode::GenerateDesignSpace); TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorPySpaceGenerator") .set_body_typed(SpaceGenerator::PySpaceGenerator); +TVM_REGISTER_GLOBAL("meta_schedule.SpaceGeneratorClone") + .set_body_method(&SpaceGeneratorNode::Clone); } // namespace 
meta_schedule } // namespace tvm diff --git a/src/meta_schedule/space_generator/space_generator_union.cc b/src/meta_schedule/space_generator/space_generator_union.cc index 6ea61824f932..771d0c187f97 100644 --- a/src/meta_schedule/space_generator/space_generator_union.cc +++ b/src/meta_schedule/space_generator/space_generator_union.cc @@ -47,6 +47,15 @@ class SpaceGeneratorUnionNode : public SpaceGeneratorNode { return design_spaces; } + SpaceGenerator Clone() const final { + ObjectPtr n = make_object(*this); + n->space_generators = Array(); + for (const SpaceGenerator& space_generator : this->space_generators) { + n->space_generators.push_back(space_generator->Clone()); + } + return SpaceGenerator(n); + } + static constexpr const char* _type_key = "meta_schedule.SpaceGeneratorUnion"; TVM_DECLARE_FINAL_OBJECT_INFO(SpaceGeneratorUnionNode, SpaceGeneratorNode); }; diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 57b2344c6f8d..3650c0374dab 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -52,6 +52,32 @@ TuneContext::TuneContext(Optional mod, data_ = std::move(n); } +TuneContext TuneContextNode::Clone() const { + ObjectPtr n = make_object(*this); + if (this->sch_rules.defined()) { + n->sch_rules = Array(); + for (const ScheduleRule& sch_rule : this->sch_rules) { + n->sch_rules.push_back(sch_rule->Clone()); + } + } + if (this->postprocs.defined()) { + n->postprocs = Array(); + for (const Postproc& postproc : this->postprocs) { + n->postprocs.push_back(postproc->Clone()); + } + } + if (this->mutator_probs.defined()) { + n->mutator_probs = Map(); + for (const auto& kv : this->mutator_probs) { + n->mutator_probs.Set(kv.first->Clone(), kv.second); + } + } + if (this->space_generator.defined()) n->space_generator = this->space_generator.value()->Clone(); + if (this->search_strategy.defined()) n->search_strategy = this->search_strategy.value()->Clone(); + n->Initialize(); + return TuneContext(n); 
+} + void TuneContextNode::Initialize() { if (this->space_generator.defined()) { this->space_generator.value()->InitializeWithTuneContext(GetRef(this)); From 77d0a288df4a1975784def14b316bde576fe3980 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 15 Sep 2022 23:28:45 -0700 Subject: [PATCH 183/704] [MetaSchedule][Test] MLT uses SEqual tests (#12805) This PR finishes migration from `check_trace` (string-based equality check on TIR trace) to `check_sketch` (SEqual-based equality check on TIR). Here, we split multi-level-tiling into 3 files: - Plain multi-level tiling without any intrinsics - Multi-level tiling with intrinsics like VNNI, DP4a - Multi-level tiling with TensorCore which comes with different handling Besides, we cleaned up the testing folder and removed several methods that are no longer useful for unittests. --- .../meta_schedule/testing/schedule_rule.py | 138 +- .../multi_level_tiling_tensor_core.cc | 4 +- src/meta_schedule/utils.h | 35 +- ...t_meta_schedule_schedule_rule_auto_bind.py | 22 +- ...meta_schedule_schedule_rule_auto_inline.py | 19 +- ...le_schedule_rule_cross_thread_reduction.py | 17 +- .../test_meta_schedule_schedule_rule_mlt.py | 529 ++++++++ ..._meta_schedule_schedule_rule_mlt_intrin.py | 418 ++++++ ...test_meta_schedule_schedule_rule_mlt_tc.py | 957 +++++++++++++ ...hedule_schedule_rule_multi_level_tiling.py | 1205 ----------------- 10 files changed, 1961 insertions(+), 1383 deletions(-) create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py create mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py delete mode 100644 tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py diff --git a/python/tvm/meta_schedule/testing/schedule_rule.py b/python/tvm/meta_schedule/testing/schedule_rule.py index 12ca4200d77a..f14e90b6f0b2 100644 --- 
a/python/tvm/meta_schedule/testing/schedule_rule.py +++ b/python/tvm/meta_schedule/testing/schedule_rule.py @@ -15,122 +15,22 @@ # specific language governing permissions and limitations # under the License. """Default schedule rules""" -from typing import List, Union - -from tvm.meta_schedule.schedule_rule import ( - AutoInline, - MultiLevelTiling, - MultiLevelTilingTensorCore, - ReuseType, - ScheduleRule, -) -from tvm.target import Target - - -def auto_inline(target: Target) -> ScheduleRule: - """Default schedule rules for auto inline""" - if target.kind.name == "llvm": - return AutoInline( - into_producer=False, - into_consumer=True, - inline_const_tensor=True, - disallow_if_then_else=True, - require_injective=True, - require_ordered=True, - disallow_op=["tir.exp"], - ) - if target.kind.name == "cuda": - return AutoInline( - into_producer=True, - into_consumer=True, - inline_const_tensor=True, - disallow_if_then_else=False, - require_injective=False, - require_ordered=False, - disallow_op=None, - ) - raise NotImplementedError(f"{target.kind.name} is not supported") - - -def multi_level_tiling(target: Target) -> ScheduleRule: - """Default schedule rules for with multi-level tiling and reuse""" - if target.kind.name == "llvm": - return MultiLevelTiling( - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=ReuseType( - req="may", - levels=[1, 2], - scope="global", - ), - ) - if target.kind.name == "cuda": - return MultiLevelTiling( - structure="SSSRRSRS", - tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"], - max_innermost_factor=64, - vector_load_lens=[1, 2, 3, 4, 8, 16], - reuse_read=ReuseType( - req="must", - levels=[4], - scope="shared", - ), - reuse_write=ReuseType( - req="must", - levels=[3], - scope="local", - ), - ) - raise NotImplementedError(f"{target.kind.name} is not supported") - - -def multi_level_tiling_tensor_core( - target: Target, - write_reuse_scope: str = "shared", - 
in_dtype: Union[str, List[str]] = "float16", - out_dtype: Union[str, List[str]] = "float32", - trans_b: Union[bool, List[bool]] = False, - use_software_pipeline: bool = False, -) -> ScheduleRule: - """Default schedule rules for with multi-level tiling reuse for tensor core""" - assert write_reuse_scope in ["shared", "global"] - if not isinstance(in_dtype, list): - in_dtype = [in_dtype] - if not isinstance(out_dtype, list): - out_dtype = [out_dtype] - if not isinstance(trans_b, list): - trans_b = [trans_b] - - if target.kind.name == "cuda": - from tvm.tir.tensor_intrin import ( # pylint: disable=import-outside-toplevel - cuda, - ) - - intrin_groups = [ - cuda.get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b) - for _in_dtype in in_dtype - for _out_dtype in out_dtype - for _trans_b in trans_b - ] - return MultiLevelTilingTensorCore( - intrin_groups=intrin_groups, - structure="SSSRRSRS", - tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"], - max_innermost_factor=4, # 64 // tensor intrin size - vector_load_lens=[1, 2, 3, 4, 8, 16], - reuse_read=ReuseType( - req="must", - levels=[4], - scope="shared", - ), - reuse_write=ReuseType( - req="must" if write_reuse_scope == "shared" else "no", - levels=[2], - scope=write_reuse_scope, - ), - use_software_pipeline=use_software_pipeline, - ) - raise NotImplementedError(f"{target.kind.name} is not supported") +from typing import List, Tuple, Union + +from tvm.meta_schedule import default_config +from tvm.meta_schedule.schedule_rule import ScheduleRule + + +def get_rules(kind: str, types: Union[type, Tuple[type, ...]]) -> List[ScheduleRule]: + """Get default schedule rules""" + # pylint: disable=protected-access + if kind == "llvm": + rules = default_config._DefaultLLVM.schedule_rules() + elif kind == "cuda": + rules = default_config._DefaultCUDA.schedule_rules() + elif kind == "tensor_core": + rules = default_config._DefaultCUDATensorCore.schedule_rules() + else: + raise NotImplementedError(f"{kind} is 
not supported") + # pylint: enable=protected-access + return [rule for rule in rules if isinstance(rule, types)] diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 13b00fa7deb6..8fcb8fe503b7 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -328,7 +328,7 @@ std::vector MultiLevelTilingTensorCoreNode::AddSoftwarePipeline( // Add local stage and double buffering for (int i = 0; i < 2; ++i) { const tir::BlockRV cache_read = state->read_reuse.at(i); - sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Bool(true)); + sch->Annotate(cache_read, tir::attr::manifest_shared_memory_local_stage, Integer(1)); sch->Annotate(cache_read, tir::attr::double_buffer_scope, Integer(0)); } @@ -536,7 +536,7 @@ inline std::vector MultiLevelTilingTensorCoreNode::TransformForTensorizat state->intrin_group.compute_intrin); state->sch->Annotate(state->block_rv, tir::attr::meta_schedule_auto_tensorize_init, state->intrin_group.init_intrin); - state->sch->Annotate(state->block_rv, tir::attr::warp_execution, Bool(true)); + state->sch->Annotate(state->block_rv, tir::attr::warp_execution, Integer(1)); return {std::move(state)}; } diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index ad56fa7f6a52..cf9a32917031 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -77,33 +77,34 @@ class PyLogMessage { // FATAL not included }; - PyLogMessage(const std::string& file, int lineno, PackedFunc logging_func, Level logging_level) { - this->logging_func = logging_func; - this->logging_level = logging_level; - } + explicit PyLogMessage(const char* file, int lineno, PackedFunc logging_func, Level logging_level) + : file_(file), lineno_(lineno), logging_func_(logging_func), logging_level_(logging_level) {} + TVM_NO_INLINE ~PyLogMessage() { - if 
(this->logging_func.defined()) { - logging_func(static_cast(logging_level), stream_.str()); + if (this->logging_func_.defined()) { + logging_func_(static_cast(logging_level_), stream_.str()); } else { - if (logging_level == Level::INFO) { - LOG(INFO) << stream_.str(); - } else if (logging_level == Level::WARNING) { - LOG(WARNING) << stream_.str(); - } else if (logging_level == Level::ERROR) { - LOG(ERROR) << stream_.str(); - } else if (logging_level == Level::DEBUG) { - DLOG(INFO) << stream_.str(); + if (logging_level_ == Level::INFO) { + runtime::detail::LogMessage(file_, lineno_).stream() << stream_.str(); + } else if (logging_level_ == Level::WARNING) { + runtime::detail::LogMessage(file_, lineno_).stream() << "Warning: " << stream_.str(); + } else if (logging_level_ == Level::ERROR) { + runtime::detail::LogMessage(file_, lineno_).stream() << "Error: " << stream_.str(); + } else if (logging_level_ == Level::DEBUG) { + runtime::detail::LogMessage(file_, lineno_).stream() << "Debug: " << stream_.str(); } else { - LOG(FATAL) << stream_.str(); + runtime::detail::LogFatal(file_, lineno_).stream() << stream_.str(); } } } std::ostringstream& stream() { return stream_; } private: + const char* file_; + int lineno_; std::ostringstream stream_; - PackedFunc logging_func; - Level logging_level; + PackedFunc logging_func_; + Level logging_level_; }; /*! \brief The type of the random state */ diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py index 21ad04da473e..a50292df7ae3 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_bind.py @@ -16,6 +16,7 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.schedule_rule import get_rules from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target @@ -83,12 +84,7 @@ def elementwise_0( mod=mod, target=Target("nvidia/geforce-rtx-3080", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.AutoBind( - max_threadblocks=256, - thread_extents=[32, 64, 128, 256, 512, 1024], - ) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind), task_name="test", ).generate_design_space() check_sketches( @@ -122,12 +118,7 @@ def reduction_loop_only_0( mod=mod, target=Target("nvidia/geforce-rtx-3080", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.AutoBind( - max_threadblocks=256, - thread_extents=[32, 64, 128, 256, 512, 1024], - ) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind), task_name="test", ).generate_design_space() check_sketches( @@ -158,12 +149,7 @@ def zero_dim_add_0( mod=mod, target=Target("nvidia/geforce-rtx-3080", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.AutoBind( - max_threadblocks=256, - thread_extents=[32, 64, 128, 256, 512, 1024], - ) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.AutoBind), task_name="test", ).generate_design_space() check_sketches( diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py index fcf6a8571b7f..c0801c9d7b5e 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_auto_inline.py @@ -16,9 +16,8 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply -from tvm.meta_schedule.testing.schedule_rule import auto_inline -from tvm.meta_schedule.tune_context import TuneContext +from tvm import meta_schedule as ms +from tvm.meta_schedule.testing.schedule_rule import get_rules from tvm.script import tir as T from tvm.target import Target @@ -340,10 +339,10 @@ def main(T_full: T.Buffer[(1, 12, 4096), "int64"]) -> None: def _create_context(mod, target, rule): - ctx = TuneContext( + ctx = ms.TuneContext( mod=mod, target=target, - space_generator=PostOrderApply(), + space_generator=ms.space_generator.PostOrderApply(), sch_rules=[rule], task_name="test", ) @@ -356,7 +355,7 @@ def test_inline_consumer_chain(): ctx = _create_context( mod=mod, target=target, - rule=auto_inline(target=target), + rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0], ) (space,) = ctx.space_generator.generate_design_space(mod=mod) tvm.ir.assert_structural_equal(lhs=space.mod, rhs=Conv2DBiasBnReLUInlined) @@ -368,7 +367,7 @@ def test_inline_into_cache(): ctx = _create_context( mod=mod, target=target, - rule=auto_inline(target=target), + rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0], ) (space,) = ctx.space_generator.generate_design_space(mod=mod) tvm.ir.assert_structural_equal(lhs=space.mod, rhs=MultiLevelTiledConv2DAfterInline) @@ -380,7 +379,7 @@ def test_inline_into_multiple_consumers(): ctx = _create_context( mod=mod, target=target, - rule=auto_inline(target=target), + rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0], ) (space,) = ctx.space_generator.generate_design_space(mod=mod) tvm.ir.assert_structural_equal(lhs=space.mod, rhs=SoftmaxAfterInline) @@ -392,7 +391,7 @@ def test_inline_pure_spatial(): ctx = _create_context( mod=mod, target=target, - rule=auto_inline(target=target), + rule=get_rules("llvm", ms.schedule_rule.AutoInline)[0], ) (space,) = 
ctx.space_generator.generate_design_space(mod=mod) tvm.ir.assert_structural_equal(lhs=space.mod, rhs=AfterPureSpatial) @@ -404,7 +403,7 @@ def test_inline_constant_tensor(): ctx = _create_context( mod=mod, target=target, - rule=auto_inline(target=target), + rule=get_rules("cuda", ms.schedule_rule.AutoInline)[0], ) (space,) = ctx.space_generator.generate_design_space(mod=mod) tvm.ir.assert_structural_equal(lhs=space.mod, rhs=ConstConsumer) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py index ab8df6678b0b..4278638a1aa3 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py @@ -19,6 +19,7 @@ import tvm from tvm import meta_schedule as ms from tvm.meta_schedule.testing import te_workload +from tvm.meta_schedule.testing.schedule_rule import get_rules from tvm.meta_schedule.testing.space_generation import check_sketches from tvm.script import tir as T from tvm.target import Target @@ -283,9 +284,7 @@ def softmax_mn_3( mod=mod, target=Target("nvidia/geforce-rtx-3090", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction), task_name="test", ).generate_design_space() check_sketches( @@ -481,9 +480,7 @@ def softmax_mn_after_inline_3( mod=mod, target=Target("nvidia/geforce-rtx-3090", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction), task_name="test", ).generate_design_space() check_sketches( @@ -559,9 +556,7 @@ def batch_norm_bmn_1(A: 
T.Buffer[(1, 512, 512), "float32"], D: T.Buffer[1, "floa mod=mod, target=Target("nvidia/geforce-rtx-3090", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction), task_name="test", ).generate_design_space() check_sketches( @@ -657,9 +652,7 @@ def argmax_1( mod=mod, target=Target("nvidia/geforce-rtx-3090", host="llvm"), space_generator=ms.space_generator.PostOrderApply(), - sch_rules=[ - ms.schedule_rule.CrossThreadReduction(thread_extents=[4, 8, 16, 32, 64, 128, 256, 512]) - ], + sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction), task_name="test", ).generate_design_space() check_sketches( diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py new file mode 100644 index 000000000000..939ccbe54fa6 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py @@ -0,0 +1,529 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring +from tvm import meta_schedule as ms +from tvm import te +from tvm.meta_schedule.testing import te_workload +from tvm.meta_schedule.testing.schedule_rule import get_rules +from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.script import tir as T +from tvm.target import Target + + +def test_cpu_matmul(): + @T.prim_func + def cpu_matmul_0( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_global = T.alloc_buffer([512, 512], dtype="float32") + for i0_0, i1_0, i0_1, i1_1 in T.grid(1, 8, 8, 1): + for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(16, 2, 8, 32, 32, 8): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3) + j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3) + k = T.axis.reduce(512, i2_0 * 32 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C_global[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C_global[i, j] = T.float32(0) + C_global[i, j] = C_global[i, j] + A[i, k] * B[k, j] + for ax0, ax1 in T.grid(64, 64): + with T.block("C_global"): + v0 = T.axis.spatial(512, i0_1 * 64 + ax0) + v1 = T.axis.spatial(512, i1_0 * 64 + ax1) + T.reads(C_global[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_global[v0, v1] + + @T.prim_func + def cpu_matmul_1( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_global = T.alloc_buffer([512, 512], dtype="float32") + for i0_0, i1_0 in T.grid(1, 8): + for i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(8, 1, 16, 2, 8, 32, 
32, 8): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3) + j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3) + k = T.axis.reduce(512, i2_0 * 32 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C_global[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C_global[i, j] = T.float32(0) + C_global[i, j] = C_global[i, j] + A[i, k] * B[k, j] + for ax0, ax1 in T.grid(512, 64): + with T.block("C_global"): + v0 = T.axis.spatial(512, ax0) + v1 = T.axis.spatial(512, i1_0 * 64 + ax1) + T.reads(C_global[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_global[v0, v1] + + @T.prim_func + def cpu_matmul_2( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0_0, i1_0, i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid( + 1, 8, 8, 1, 16, 2, 8, 32, 32, 8 + ): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 512 + i0_1 * 64 + i0_2 * 32 + i0_3) + j = T.axis.spatial(512, i1_0 * 64 + i1_1 * 64 + i1_2 * 8 + i1_3) + k = T.axis.reduce(512, i2_0 * 32 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + + decision_0 = [ + ("SamplePerfectTile", [1, 8, 2, 32]), + ("SamplePerfectTile", [8, 1, 8, 8]), + ("SamplePerfectTile", [16, 32]), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 8, 2, 32]), + ("SamplePerfectTile", [8, 1, 8, 8]), + ("SamplePerfectTile", [16, 32]), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 8, 2, 32]), + ("SamplePerfectTile", [8, 1, 8, 8]), + ("SamplePerfectTile", [16, 32]), + ] + + mod = te.create_prim_func(te_workload.matmul(512, 512, 512)) + actual = ms.TuneContext( + mod=mod, + target=Target("llvm"), + 
space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling), + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cpu_matmul_0, cpu_matmul_1, cpu_matmul_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + +def test_cpu_matmul_relu(): + @T.prim_func + def cpu_matmul_relu_0( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + compute: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([512, 512], dtype="float32") + for i0_0, i1_0, i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid( + 256, 4, 1, 4, 64, 1, 32, 8, 2, 1 + ): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3) + j = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3) + k = T.axis.reduce(512, i2_0 * 8 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + for i0, i1 in T.grid(512, 512): + with T.block("compute"): + i0_4, i1_4 = T.axis.remap("SS", [i0, i1]) + T.reads(C[i0_4, i1_4]) + T.writes(compute[i0_4, i1_4]) + compute[i0_4, i1_4] = T.max(C[i0_4, i1_4], T.float32(0)) + + @T.prim_func + def cpu_matmul_relu_1( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + compute: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([512, 512], dtype="float32") + for i0_0, i1_0, i0_1, i1_1 in T.grid(256, 4, 1, 4): + for i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(64, 1, 32, 8, 2, 1): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3) + j = 
T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3) + k = T.axis.reduce(512, i2_0 * 8 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + for ax0, ax1 in T.grid(2, 32): + with T.block("compute"): + i0 = T.axis.spatial(512, i0_0 * 2 + ax0) + i1 = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + ax1) + T.reads(C[i0, i1]) + T.writes(compute[i0, i1]) + compute[i0, i1] = T.max(C[i0, i1], T.float32(0)) + + @T.prim_func + def cpu_matmul_relu_2( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + compute: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([512, 512], dtype="float32") + for i0_0, i1_0 in T.grid(256, 4): + for i0_1, i1_1, i2_0, i0_2, i1_2, i2_1, i0_3, i1_3 in T.grid(1, 4, 64, 1, 32, 8, 2, 1): + with T.block("C"): + i = T.axis.spatial(512, i0_0 * 2 + i0_1 * 2 + i0_2 * 2 + i0_3) + j = T.axis.spatial(512, i1_0 * 128 + i1_1 * 32 + i1_2 + i1_3) + k = T.axis.reduce(512, i2_0 * 8 + i2_1) + T.reads(A[i, k], B[k, j]) + T.writes(C[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSRSRS"}) + with T.init(): + C[i, j] = T.float32(0) + C[i, j] = C[i, j] + A[i, k] * B[k, j] + for ax0, ax1 in T.grid(2, 128): + with T.block("compute"): + i0 = T.axis.spatial(512, i0_0 * 2 + ax0) + i1 = T.axis.spatial(512, i1_0 * 128 + ax1) + T.reads(C[i0, i1]) + T.writes(compute[i0, i1]) + compute[i0, i1] = T.max(C[i0, i1], T.float32(0)) + + decision_0 = [ + ("SamplePerfectTile", [256, 1, 1, 2]), + ("SamplePerfectTile", [4, 4, 32, 1]), + ("SamplePerfectTile", [64, 8]), + ] + decision_1 = [ + ("SamplePerfectTile", [256, 1, 1, 2]), + ("SamplePerfectTile", [4, 4, 32, 1]), + ("SamplePerfectTile", [64, 8]), + ] + decision_2 = [ + ("SamplePerfectTile", [256, 1, 1, 2]), + 
("SamplePerfectTile", [4, 4, 32, 1]), + ("SamplePerfectTile", [64, 8]), + ] + mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512)) + actual = ms.TuneContext( + mod=mod, + target=Target("llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("llvm", ms.schedule_rule.MultiLevelTiling), + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cpu_matmul_relu_0, cpu_matmul_relu_1, cpu_matmul_relu_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + +def test_cuda_matmul(): + @T.prim_func + def cuda_matmul_0( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + C: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_local = T.alloc_buffer([512, 512], dtype="float32", scope="local") + A_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + B_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + for i0_0_i1_0_fused in T.thread_binding(128, thread="blockIdx.x"): + for i0_1_i1_1_fused in T.thread_binding(8, thread="vthread.x"): + for i0_2_i1_2_fused in T.thread_binding(4, thread="threadIdx.x"): + for i2_0 in T.serial(128): + for ax0_ax1_fused in T.serial(256): + with T.block("A_shared"): + v0 = T.axis.spatial( + 512, i0_0_i1_0_fused // 16 * 64 + ax0_ax1_fused // 4 + ) + v1 = T.axis.spatial(512, i2_0 * 4 + ax0_ax1_fused % 4) + T.reads(A[v0, v1]) + T.writes(A_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 2}) + A_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(128): + with T.block("B_shared"): + v0 = T.axis.spatial(512, i2_0 * 4 + ax0_ax1_fused // 32) + v1 = T.axis.spatial( + 512, i0_0_i1_0_fused % 16 * 32 + ax0_ax1_fused % 32 + ) + T.reads(B[v0, v1]) + T.writes(B_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 1}) + B_shared[v0, v1] = B[v0, 
v1] + for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(2, 1, 1, 2, 16, 4): + with T.block("C"): + i = T.axis.spatial( + 512, + i0_0_i1_0_fused // 16 * 64 + + i0_1_i1_1_fused // 2 * 16 + + i0_3 * 16 + + i0_4, + ) + j = T.axis.spatial( + 512, + i0_0_i1_0_fused % 16 * 32 + + i0_1_i1_1_fused % 2 * 16 + + i0_2_i1_2_fused * 4 + + i1_3 * 4 + + i1_4, + ) + k = T.axis.reduce(512, i2_0 * 4 + i2_1 * 2 + i2_2) + T.reads(A_shared[i, k], B_shared[k, j]) + T.writes(C_local[i, j]) + T.block_attr( + { + "meta_schedule.thread_extent_high_inclusive": 1024, + "meta_schedule.thread_extent_low_inclusive": 32, + "meta_schedule.tiling_structure": "SSSRRSRS", + } + ) + with T.init(): + C_local[i, j] = T.float32(0) + C_local[i, j] = C_local[i, j] + A_shared[i, k] * B_shared[k, j] + for ax0, ax1 in T.grid(16, 4): + with T.block("C_local"): + v0 = T.axis.spatial( + 512, i0_0_i1_0_fused // 16 * 64 + i0_1_i1_1_fused // 2 * 16 + ax0 + ) + v1 = T.axis.spatial( + 512, + i0_0_i1_0_fused % 16 * 32 + + i0_1_i1_1_fused % 2 * 16 + + i0_2_i1_2_fused * 4 + + ax1, + ) + T.reads(C_local[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_local[v0, v1] + + decision_0 = [ + ("SamplePerfectTile", [8, 4, 1, 1, 16]), + ("SamplePerfectTile", [16, 2, 4, 1, 4]), + ("SamplePerfectTile", [128, 2, 2]), + ("SampleCategorical", 1), + ("SampleCategorical", 0), + ] + mod = te.create_prim_func(te_workload.matmul(512, 512, 512)) + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3080"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling), + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cuda_matmul_0], + expected_decisions=[decision_0], + ) + + +def test_cuda_matmul_relu(): + @T.prim_func + def cuda_matmul_relu_0( + A: T.Buffer[(512, 512), "float32"], + B: T.Buffer[(512, 512), "float32"], + compute: T.Buffer[(512, 512), "float32"], + ) -> None: + # function attr dict + 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([512, 512], dtype="float32") + C_local = T.alloc_buffer([512, 512], dtype="float32", scope="local") + A_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + B_shared = T.alloc_buffer([512, 512], dtype="float32", scope="shared") + for i0_0_i1_0_fused in T.thread_binding(64, thread="blockIdx.x"): + for i0_1_i1_1_fused in T.thread_binding(64, thread="vthread.x"): + for i0_2_i1_2_fused in T.thread_binding(8, thread="threadIdx.x"): + for i2_0 in T.serial(8): + for ax0_ax1_fused in T.serial(4096): + with T.block("A_shared"): + v0 = T.axis.spatial( + 512, i0_0_i1_0_fused // 8 * 64 + ax0_ax1_fused // 64 + ) + v1 = T.axis.spatial(512, i2_0 * 64 + ax0_ax1_fused % 64) + T.reads(A[v0, v1]) + T.writes(A_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 2}) + A_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(4096): + with T.block("B_shared"): + v0 = T.axis.spatial(512, i2_0 * 64 + ax0_ax1_fused // 64) + v1 = T.axis.spatial( + 512, i0_0_i1_0_fused % 8 * 64 + ax0_ax1_fused % 64 + ) + T.reads(B[v0, v1]) + T.writes(B_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 4}) + B_shared[v0, v1] = B[v0, v1] + for i2_1, i0_3, i1_3, i2_2, i0_4, i1_4 in T.grid(8, 2, 1, 8, 2, 2): + with T.block("C"): + i = T.axis.spatial( + 512, + i0_0_i1_0_fused // 8 * 64 + + i0_1_i1_1_fused // 8 * 8 + + i0_2_i1_2_fused // 4 * 4 + + i0_3 * 2 + + i0_4, + ) + j = T.axis.spatial( + 512, + i0_0_i1_0_fused % 8 * 64 + + i0_1_i1_1_fused % 8 * 8 + + i0_2_i1_2_fused % 4 * 2 + + i1_3 * 2 + + i1_4, + ) + k = T.axis.reduce(512, i2_0 * 64 + i2_1 * 8 + i2_2) + T.reads(A_shared[i, k], B_shared[k, j]) + T.writes(C_local[i, j]) + T.block_attr( + { + "meta_schedule.thread_extent_high_inclusive": 1024, + "meta_schedule.thread_extent_low_inclusive": 32, + "meta_schedule.tiling_structure": "SSSRRSRS", + } + ) + with T.init(): + C_local[i, j] = 
T.float32(0) + C_local[i, j] = C_local[i, j] + A_shared[i, k] * B_shared[k, j] + for ax0, ax1 in T.grid(4, 2): + with T.block("C_local"): + v0 = T.axis.spatial( + 512, + i0_0_i1_0_fused // 8 * 64 + + i0_1_i1_1_fused // 8 * 8 + + i0_2_i1_2_fused // 4 * 4 + + ax0, + ) + v1 = T.axis.spatial( + 512, + i0_0_i1_0_fused % 8 * 64 + + i0_1_i1_1_fused % 8 * 8 + + i0_2_i1_2_fused % 4 * 2 + + ax1, + ) + T.reads(C_local[v0, v1]) + T.writes(C[v0, v1]) + C[v0, v1] = C_local[v0, v1] + for i0, i1 in T.grid(512, 512): + with T.block("compute"): + i0_1, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(C[i0_1, i1_1]) + T.writes(compute[i0_1, i1_1]) + compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0)) + + decision_0 = [ + ("SamplePerfectTile", [8, 8, 2, 2, 2]), + ("SamplePerfectTile", [8, 8, 4, 1, 2]), + ("SamplePerfectTile", [8, 8, 8]), + ("SampleCategorical", 1), + ("SampleCategorical", 3), + ] + mod = te.create_prim_func(te_workload.matmul_relu(512, 512, 512)) + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3080"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling), + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[cuda_matmul_relu_0], + expected_decisions=[decision_0], + ) + + +def test_cuda_sum_with_trivial_block_iter(): + @T.prim_func + def sum_with_trivial_block_iter( + A: T.Buffer[(1, 64, 768), "float32"], + B: T.Buffer[(1, 64, 1), "float32"], + ) -> None: + for i0, i1, i2, i3 in T.grid(1, 64, 1, 768): + with T.block("sum"): + ax0, ax1, ax2, k2 = T.axis.remap("SSSR", [i0, i1, i2, i3]) + T.reads(A[ax0, ax1, k2]) + T.writes(B[ax0, ax1, ax2]) + with T.init(): + B[ax0, ax1, ax2] = T.float32(0) + B[ax0, ax1, ax2] = B[ax0, ax1, ax2] + A[ax0, ax1, k2] + + # Expect nothing to happen - the rule is not supposed to be applied in this case + mod = sum_with_trivial_block_iter + (sch,) = ms.TuneContext( + mod=mod, + 
target=Target("nvidia/geforce-rtx-3080"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("cuda", ms.schedule_rule.MultiLevelTiling), + task_name="test", + ).generate_design_space() + assert not sch.trace.simplified(remove_postproc=True).insts + + +if __name__ == "__main__": + test_cpu_matmul() + test_cpu_matmul_relu() + test_cuda_matmul() + test_cuda_matmul_relu() + test_cuda_sum_with_trivial_block_iter() diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py new file mode 100644 index 000000000000..38ddb137e108 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -0,0 +1,418 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring +from tvm import meta_schedule as ms +from tvm import te +from tvm.ir import assert_structural_equal +from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.script import tir as T +from tvm.target import Target +from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN + + +def test_vnni_conv2d_nchwc(): + @T.prim_func + def conv2d_nchwc( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) + + # fmt: off + @T.prim_func + def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + 
conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") + for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1): + for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): + with T.block("conv2d_NCHWc_int8_o"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) + oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) + ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) + oc_block_o = T.axis.spatial(1, 0) + kh = T.axis.reduce(1, 0) + kw = T.axis.reduce(1, 0) + ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) + ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) + ic_s_inner_o = T.axis.reduce(1, 0) + T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) + T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + with T.init(): + for i4_1 in T.serial(16): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(16, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i4_1, i9_1 in T.grid(16, 4): + with T.block("conv2d_NCHWc_int8"): + oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1]) + T.reads(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i]) + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] = 
conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32") + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 1, 2, 1, 16): + with T.block("conv2d_NCHWc_int8_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(16, i1_0 * 2 + i1_1 + ax1) + v2 = T.axis.spatial(56, i2_0 * 2 + ax2) + v3 = T.axis.spatial(56, i3_0 + ax3) + v4 = T.axis.spatial(16, ax4) + T.reads(conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4]) + T.writes(conv2d_NCHWc_int8[v0, v1, v2, v3, v4]) + conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] + + @T.prim_func + def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") + for i0_0, i1_0, i2_0, i3_0, i4_0_0 in T.grid(1, 8, 28, 56, 1): + for i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): + with T.block("conv2d_NCHWc_int8_o"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) + oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) + ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) + oc_block_o = T.axis.spatial(1, 0) + kh = T.axis.reduce(1, 0) + kw = T.axis.reduce(1, 0) + ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) + ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) + ic_s_inner_o = T.axis.reduce(1, 0) + T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, 
kh, kw, ic_f_inner, 0 : 16, 0 : 4]) + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) + T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + with T.init(): + for i4_1 in T.serial(16): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(16, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i4_1, i9_1 in T.grid(16, 4): + with T.block("conv2d_NCHWc_int8"): + oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1]) + T.reads(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i]) + T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] = conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32") + for ax0, ax1, ax2, ax3, ax4 in T.grid(1, 2, 2, 1, 16): + with T.block("conv2d_NCHWc_int8_global"): + v0 = T.axis.spatial(1, ax0) + v1 = T.axis.spatial(16, i1_0 * 2 + ax1) + v2 = T.axis.spatial(56, i2_0 * 2 + ax2) + v3 = T.axis.spatial(56, i3_0 + ax3) + v4 = T.axis.spatial(16, ax4) + T.reads(conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4]) + T.writes(conv2d_NCHWc_int8[v0, v1, v2, v3, v4]) + conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] + + @T.prim_func + def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + T.func_attr({"global_symbol": "main", 
"tir.noalias": True}) + for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): + with T.block("conv2d_NCHWc_int8_o"): + n = T.axis.spatial(1, 0) + oc_chunk = T.axis.spatial(16, i1_0 * 2 + i1_1 + i1_2 + i1_3) + oh = T.axis.spatial(56, i2_0 * 2 + i2_1 * 2 + i2_2 + i2_3) + ow = T.axis.spatial(56, i3_3 + i3_0 + i3_1 + i3_2) + oc_block_o = T.axis.spatial(1, 0) + kh = T.axis.reduce(1, 0) + kw = T.axis.reduce(1, 0) + ic_outer = T.axis.reduce(4, i7_0 * 4 + i7_1) + ic_f_inner = T.axis.reduce(4, i8_0 + i8_1) + ic_s_inner_o = T.axis.reduce(1, 0) + T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) + T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + with T.init(): + for i4_1 in T.serial(16): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(16, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i4_1, i9_1 in T.grid(16, 4): + with T.block("conv2d_NCHWc_int8"): + oc_block_i, ic_s_inner_i = T.axis.remap("SR", [i4_1, i9_1]) + T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i], placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner_i], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSRSRS"}) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i] = conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i] + T.cast(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + 
ic_s_inner_i], "int32") * T.cast(placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block_i, ic_s_inner_i], "int32") + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [8, 2, 1, 1]), + ("SamplePerfectTile", [28, 1, 2, 1]), + ("SamplePerfectTile", [56, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 4]), + ("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 1]), + ] + decision_1 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [8, 2, 1, 1]), + ("SamplePerfectTile", [28, 1, 2, 1]), + ("SamplePerfectTile", [56, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 4]), + ("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 1]), + ] + decision_2 = [ + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [8, 2, 1, 1]), + ("SamplePerfectTile", [28, 1, 2, 1]), + ("SamplePerfectTile", [56, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 1]), + ("SamplePerfectTile", [1, 4]), + ("SamplePerfectTile", [4, 1]), + ("SamplePerfectTile", [1, 1]), + ] + + mod = conv2d_nchwc + target = Target("llvm -mcpu=cascadelake -num-cores=4") + actual = ms.TuneContext( + mod=mod, + target=Target(target), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.MultiLevelTilingWithIntrin( + VNNI_INTRIN, + structure="SSRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=ms.schedule_rule.ReuseType(req="may", levels=[1, 2], scope="global"), + ), + ], + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[vnni_conv2d_nchwc_0, vnni_conv2d_nchwc_1, vnni_conv2d_nchwc_2], + expected_decisions=[decision_0, decision_1, decision_2], + ) + + +def 
_check_dp4a_dense(m, n, k, in_dtype, out_dtype, expected_mods, expected_decisions): + def _dense(m, n, k, in_dtype, out_dtype): + X = te.placeholder((m, k), name="X", dtype=in_dtype) + W = te.placeholder((n, k), name="W", dtype=in_dtype) + ak = te.reduce_axis((0, k), name="k") + matmul = te.compute( + (m, n), + lambda i, j: te.sum( + X[i, ak].astype(out_dtype) * W[j, ak].astype(out_dtype), + axis=ak, + ), + name="compute", + ) + return te.create_prim_func([X, W, matmul]) + + mod = _dense(m, n, k, in_dtype, out_dtype) + actual = ms.TuneContext( + mod=mod, + target=Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.MultiLevelTilingWithIntrin( + DP4A_INTRIN, + structure="SSSRRSRS", + tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"], + max_innermost_factor=64, + vector_load_lens=[1, 2, 3, 4], + reuse_read=ms.schedule_rule.ReuseType(req="must", levels=[4], scope="shared"), + reuse_write=ms.schedule_rule.ReuseType(req="must", levels=[3], scope="local"), + ) + ], + ).generate_design_space() + if expected_mods is None: + assert expected_decisions is None + assert len(actual) == 1 + assert_structural_equal(mod, actual[0].mod["main"]) + else: + check_sketches(mod, actual, expected_mods, expected_decisions) + + +def test_dp4a_dense(): + @T.prim_func + def dp4a_dense_0( + X: T.Buffer[(128, 128), "int8"], + W: T.Buffer[(128, 128), "int8"], + compute: T.Buffer[(128, 128), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + compute_local = T.alloc_buffer([128, 128], dtype="int32", scope="local") + X_shared = T.alloc_buffer([128, 128], dtype="int8", scope="shared") + W_shared = T.alloc_buffer([128, 128], dtype="int8", scope="shared") + for i0_0_i1_0_fused in T.thread_binding(1, thread="blockIdx.x"): + for i0_1_i1_1_fused in T.thread_binding(512, thread="vthread.x"): + for i0_2_i1_2_fused in T.thread_binding(2, thread="threadIdx.x"): 
+ for i2_0_0 in T.serial(1): + for ax0_ax1_fused in T.serial(16384): + with T.block("X_shared"): + v0 = T.axis.spatial(128, ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(X[v0, v1]) + T.writes(X_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 1}) + X_shared[v0, v1] = X[v0, v1] + for ax0_ax1_fused in T.serial(16384): + with T.block("W_shared"): + v0 = T.axis.spatial(128, ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(W[v0, v1]) + T.writes(W_shared[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch": 1}) + W_shared[v0, v1] = W[v0, v1] + for i2_0_1, i0_3, i1_3, i2_0_2, i0_4, i1_4 in T.grid(1, 2, 4, 32, 2, 1): + with T.block("compute_o"): + i = T.axis.spatial( + 128, + i0_1_i1_1_fused // 32 * 8 + + i0_2_i1_2_fused * 4 + + i0_3 * 2 + + i0_4, + ) + j = T.axis.spatial(128, i1_4 + i0_1_i1_1_fused % 32 * 4 + i1_3) + k_o = T.axis.reduce(32, i2_0_0 * 32 + i2_0_1 * 32 + i2_0_2) + T.reads( + X_shared[i, k_o * 4 : k_o * 4 + 4], + W_shared[j, k_o * 4 : k_o * 4 + 4], + ) + T.writes(compute_local[i, j]) + T.block_attr({"meta_schedule.auto_tensorize": "dp4a"}) + with T.init(): + with T.block("compute_init"): + T.reads() + T.writes(compute_local[i, j]) + compute_local[i, j] = 0 + for i2_1 in T.serial(4): + with T.block("compute"): + k_i = T.axis.reduce(4, i2_1) + T.reads( + compute_local[i, j], + X_shared[i, k_o * 4 + k_i], + W_shared[j, k_o * 4 + k_i], + ) + T.writes(compute_local[i, j]) + T.block_attr({"meta_schedule.tiling_structure": "SSSRRSRS"}) + compute_local[i, j] = compute_local[i, j] + T.cast( + X_shared[i, k_o * 4 + k_i], "int32" + ) * T.cast(W_shared[j, k_o * 4 + k_i], "int32") + for ax0, ax1 in T.grid(4, 4): + with T.block("compute_local"): + v0 = T.axis.spatial( + 128, i0_1_i1_1_fused // 32 * 8 + i0_2_i1_2_fused * 4 + ax0 + ) + v1 = T.axis.spatial(128, i0_1_i1_1_fused % 32 * 4 + ax1) + T.reads(compute_local[v0, v1]) + T.writes(compute[v0, v1]) + compute[v0, v1] = 
compute_local[v0, v1] + + decision_0 = [ + ("SamplePerfectTile", [1, 16, 2, 2, 2]), + ("SamplePerfectTile", [1, 32, 1, 4, 1]), + ("SamplePerfectTile", [1, 1, 32]), + ("SampleCategorical", 0), + ("SampleCategorical", 0), + ] + _check_dp4a_dense( + m=128, + n=128, + k=128, + in_dtype="int8", + out_dtype="int32", + expected_mods=[dp4a_dense_0], + expected_decisions=[decision_0], + ) + + +def test_dp4a_dense_no_tensorize_1(): + _check_dp4a_dense( + m=128, + n=128, + k=128, + in_dtype="float32", + out_dtype="float32", + expected_mods=None, + expected_decisions=None, + ) + + +def test_dp4a_dense_no_tensorize_2(): + _check_dp4a_dense( + m=127, + n=127, + k=127, + in_dtype="int8", + out_dtype="int32", + expected_mods=None, + expected_decisions=None, + ) + + +if __name__ == "__main__": + test_vnni_conv2d_nchwc() + test_dp4a_dense() + test_dp4a_dense_no_tensorize_1() + test_dp4a_dense_no_tensorize_2() diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py new file mode 100644 index 000000000000..fbb74090b1e5 --- /dev/null +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py @@ -0,0 +1,957 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring +import tvm +from tvm import meta_schedule as ms +from tvm import te +from tvm.meta_schedule.testing import te_workload +from tvm.meta_schedule.testing.schedule_rule import get_rules +from tvm.meta_schedule.testing.space_generation import check_sketches +from tvm.script import tir as T +from tvm.tir.tensor_intrin.cuda import get_wmma_intrin_group + + +def multi_level_tiling_tensor_core( + *, + write_reuse_scope="shared", + in_dtype="float16", + out_dtype="float32", + trans_b=False, + use_software_pipeline=False, +) -> ms.schedule_rule.ScheduleRule: + assert write_reuse_scope in ["shared", "global"] + if not isinstance(in_dtype, list): + in_dtype = [in_dtype] + if not isinstance(out_dtype, list): + out_dtype = [out_dtype] + if not isinstance(trans_b, list): + trans_b = [trans_b] + return ms.schedule_rule.MultiLevelTilingTensorCore( + intrin_groups=[ + get_wmma_intrin_group(write_reuse_scope, _in_dtype, _out_dtype, _trans_b) + for _in_dtype in in_dtype + for _out_dtype in out_dtype + for _trans_b in trans_b + ], + structure="SSSRRSRS", + tile_binds=["blockIdx.y", "blockIdx.x", "threadIdx.y"], + max_innermost_factor=4, # 64 // tensor intrin size + vector_load_lens=[1, 2, 3, 4, 8, 16], + reuse_read=ms.schedule_rule.ReuseType( + req="must", + levels=[4], + scope="shared", + ), + reuse_write=ms.schedule_rule.ReuseType( + req="must" if write_reuse_scope == "shared" else "no", + levels=[2], + scope=write_reuse_scope, + ), + use_software_pipeline=use_software_pipeline, + ) + + +def test_matmul_relu(): + # fmt: off + @T.prim_func + def matmul_relu_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + C_reindex_shared = T.alloc_buffer([128, 128], 
dtype="float32", scope="shared") + C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator") + A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a") + B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(8, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"): + for ax2_0_0 in T.serial(1): + for ax0_ax1_fused in T.serial(4096): + with T.block("A_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(A[v0, v1]) + T.writes(A_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8}) + A_reindex_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(4096): + with T.block("B_reindex_shared"): + v0 = T.axis.spatial(128, ax0_ax1_fused // 32) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32) + T.reads(B[v0, v1]) + T.writes(B_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1}) + B_reindex_shared[v0, v1] = B[v0, v1] + for ax2_0_1 in T.serial(4): + for ax0_0, ax1_0 in T.grid(2, 2): + with T.block("A_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax2_0_1 * 2 + ax1_0) + T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + 
T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("A_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(2, 1): + with T.block("B_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("B_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 2, 2, 1): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0_3 * 2 + ax0_0_4) + v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0_3) + v2_o = T.axis.reduce(8, ax2_0_0 * 8 + ax2_0_1 * 2 + ax2_0_2) + T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o 
* 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0_0, ax1_0 in T.grid(2, 1): + with T.block("C_reindex_shared_wmma.accumulator_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_reindex_shared_wmma.accumulator"): + 
v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in T.grid(32, 32): + with T.block("C_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1) + T.reads(C_reindex_shared[v0, v1]) + T.writes(compute[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0)) + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [4, 1, 1, 1, 2]), + ("SamplePerfectTile", [2, 2, 2, 1, 1]), + ("SamplePerfectTile", [1, 4, 2]), + ("SampleCategorical", 3), + ("SampleCategorical", 3), + ("SampleCategorical", 0), + ] + + mod = te.create_prim_func( + te_workload.matmul_relu( + n=128, + m=128, + k=128, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[multi_level_tiling_tensor_core()] + + get_rules("cuda", ms.schedule_rule.AutoInline), + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[matmul_relu_0], + expected_decisions=[decision_0], + ) + + +def test_matmul_relu_with_fallback(): + # fmt: off + @T.prim_func + def matmul_relu_fallback_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", 
scope="wmma.accumulator") + A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a") + B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(2, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"): + for ax2_0_0 in T.serial(2): + for ax0_ax1_fused in T.serial(2048): + with T.block("A_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused // 64) + v1 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused % 64) + T.reads(A[v0, v1]) + T.writes(A_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":4}) + A_reindex_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(8192): + with T.block("B_reindex_shared"): + v0 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(B[v0, v1]) + T.writes(B_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":2}) + B_reindex_shared[v0, v1] = B[v0, v1] + for ax2_0_1 in T.serial(1): + for ax0_0, ax1_0 in T.grid(2, 4): + with T.block("A_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax2_0_0 * 4 + ax1_0) + T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): 
+ with T.block("A_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(4, 4): + with T.block("B_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(8, ax2_0_0 * 4 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0) + T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("B_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 4, 2, 4): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_3 * 2 + ax0_0_4) + v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0_3 * 4 + ax1_0_4) + v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 4 + ax2_0_2) + T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", 
"meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0_0, ax1_0 in T.grid(2, 4): + with T.block("C_reindex_shared_wmma.accumulator_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused * 4 + ax1_0) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_reindex_shared_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + 
T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in T.grid(32, 128): + with T.block("C_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0) + v1 = T.axis.spatial(128, ax1) + T.reads(C_reindex_shared[v0, v1]) + T.writes(compute[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0)) + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [2, 2, 1, 1, 2]), + ("SamplePerfectTile", [1, 1, 2, 1, 4]), + ("SamplePerfectTile", [2, 1, 4]), + ("SampleCategorical", 3), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ] + + mod = te.create_prim_func( + te_workload.matmul_relu( + n=128, + m=128, + k=128, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + multi_level_tiling_tensor_core(), + ] + + get_rules( + "cuda", + ( + ms.schedule_rule.MultiLevelTiling, + ms.schedule_rule.AutoInline, + ), + ), + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[matmul_relu_fallback_0], + expected_decisions=[decision_0], + ) + + +def test_conv2d(): + # fmt: off + @T.prim_func + def conv2d_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, 3, 32, 32), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 32), "float32"]) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + PadInput = T.alloc_buffer([1, 18, 18, 32], dtype="float16") + conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 32], dtype="float32", scope="shared") + conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 32], dtype="float32", scope="wmma.accumulator") + PadInput_reindex_shared = T.alloc_buffer([256, 288], 
dtype="float16", scope="shared") + weight_reindex_shared = T.alloc_buffer([288, 32], dtype="float16", scope="shared") + PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 288], dtype="float16", scope="wmma.matrix_a") + weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([288, 32], dtype="float16", scope="wmma.matrix_b") + for i0, i1, i2, i3 in T.grid(1, 18, 18, 32): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16") + for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(2, thread="blockIdx.y"): + for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(16, thread="blockIdx.x"): + for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"): + for ax3_0_0 in T.serial(1): + for ax0_ax1_fused in T.serial(4608): + with T.block("PadInput_reindex_shared"): + v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 288) + v1 = T.axis.spatial(288, ax0_ax1_fused % 288) + T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]) + T.writes(PadInput_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":2}) + PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32] + for ax0_ax1_fused in T.serial(4608): + with T.block("weight_reindex_shared"): + v0 = T.axis.spatial(288, ax0_ax1_fused // 16) + v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax0_ax1_fused % 16) + T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]) + T.writes(weight_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8}) + weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 
// 32, v0 % 32, v1] + for ax3_0_1 in T.serial(18): + for ax0_0, ax1_0 in T.grid(1, 1): + with T.block("PadInput_reindex_shared_wmma.matrix_a_o"): + v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax3_0_1]) + T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("PadInput_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(1, 1): + with T.block("weight_reindex_shared_wmma.matrix_b_o"): + v0_o, v1_o = T.axis.remap("SS", [ax3_0_1, ax0_0_ax1_0_0_ax2_0_0_fused]) + T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("weight_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 1, 1, 1, 1, 1): + with T.block("conv2d_nhwc_o"): + v0 = T.axis.spatial(1, 0) + v1_o = T.axis.spatial(16, ax1_0_4 + 
ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3) + v2_o = T.axis.spatial(2, ax0_0_ax1_0_0_ax2_0_0_fused + ax2_0_3 + ax2_0_4) + v3_o = T.axis.reduce(18, ax3_0_0 * 18 + ax3_0_1 + ax3_0_2) + T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16]) + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax1_1, ax2_1 in T.grid(16, 16): + with T.block("conv2d_nhwc_init"): + v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1]) + T.reads() + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init]) + conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0) + for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16): + with T.block("conv2d_nhwc"): + v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1]) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i]) + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32") + for ax0_0, ax1_0 in T.grid(1, 1): + with 
T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): + v0_o, v1_o = T.axis.remap("SS", [ax0_1_ax1_0_1_ax2_0_1_fused, ax0_0_ax1_0_0_ax2_0_0_fused]) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in T.grid(16, 16): + with T.block("conv2d_nhwc_reindex_shared"): + v0 = T.axis.spatial(256, ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0) + v1 = T.axis.spatial(32, ax0_0_ax1_0_0_ax2_0_0_fused * 16 + ax1) + T.reads(conv2d_nhwc_reindex_shared[v0, v1]) + T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 16, 1, 1, 1]), + ("SamplePerfectTile", [2, 1, 1, 1, 1]), + ("SamplePerfectTile", [1, 18, 1]), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ("SampleCategorical", 3), + ] + mod = te.create_prim_func( + te_workload.conv2d_nhwc( + N=1, + H=16, + W=16, + CI=32, + CO=32, + kernel_size=3, + stride=1, + padding=1, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[multi_level_tiling_tensor_core()], + 
).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[conv2d_0], + expected_decisions=[decision_0], + ) + + +def test_conv2d_more_intrin(): + # test adding inapplicable tensor intrinsics doesn't change the search space + # fmt: off + @T.prim_func + def conv2d_more_intrin_0(inputs: T.Buffer[(1, 16, 16, 32), "float16"], weight: T.Buffer[(3, 3, 32, 32), "float16"], conv2d_nhwc: T.Buffer[(1, 16, 16, 32), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + PadInput = T.alloc_buffer([1, 18, 18, 32], dtype="float16") + conv2d_nhwc_reindex_shared = T.alloc_buffer([256, 32], dtype="float32", scope="shared") + conv2d_nhwc_reindex_shared_wmma_accumulator = T.alloc_buffer([256, 32], dtype="float32", scope="wmma.accumulator") + PadInput_reindex_shared = T.alloc_buffer([256, 288], dtype="float16", scope="shared") + weight_reindex_shared = T.alloc_buffer([288, 32], dtype="float16", scope="shared") + PadInput_reindex_shared_wmma_matrix_a = T.alloc_buffer([256, 288], dtype="float16", scope="wmma.matrix_a") + weight_reindex_shared_wmma_matrix_b = T.alloc_buffer([288, 32], dtype="float16", scope="wmma.matrix_b") + for i0, i1, i2, i3 in T.grid(1, 18, 18, 32): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else(1 <= i1_1 and i1_1 < 17 and 1 <= i2_1 and i2_1 < 17, inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], T.float16(0), dtype="float16") + for ax0_0_ax1_0_0_ax2_0_0_fused in T.thread_binding(4, thread="blockIdx.y"): + for ax0_1_ax1_0_1_ax2_0_1_fused in T.thread_binding(4, thread="blockIdx.x"): + for ax0_2_ax1_0_2_ax2_0_2_fused in T.thread_binding(1, thread="threadIdx.y"): + for ax3_0_0 in T.serial(3): + for ax0_ax1_fused in T.serial(1536): + with T.block("PadInput_reindex_shared"): + 
v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0_ax1_fused // 96) + v1 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused % 96) + T.reads(PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32]) + T.writes(PadInput_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8}) + PadInput_reindex_shared[v0, v1] = PadInput[0, v1 // 96 + v0 // 16, v1 % 96 // 32 + v0 % 16, v1 % 32] + for ax0_ax1_fused in T.serial(3072): + with T.block("weight_reindex_shared"): + v0 = T.axis.spatial(288, ax3_0_0 * 96 + ax0_ax1_fused // 32) + v1 = T.axis.spatial(32, ax0_ax1_fused % 32) + T.reads(weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1]) + T.writes(weight_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8}) + weight_reindex_shared[v0, v1] = weight[v0 // 96, v0 % 96 // 32, v0 % 32, v1] + for ax3_0_1 in T.serial(2): + for ax0_0, ax1_0 in T.grid(1, 3): + with T.block("PadInput_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused) + v1_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax1_0) + T.reads(PadInput_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("PadInput_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + PadInput_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = PadInput_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(3, 2): + with 
T.block("weight_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax0_0) + v1_o = T.axis.spatial(2, ax1_0) + T.reads(weight_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("weight_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + weight_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = weight_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_3, ax1_0_3, ax2_0_3, ax3_0_2, ax0_4, ax1_0_4, ax2_0_4 in T.grid(1, 1, 2, 3, 1, 1, 1): + with T.block("conv2d_nhwc_o"): + v0 = T.axis.spatial(1, 0) + v1_o = T.axis.spatial(16, ax1_0_4 + ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused + ax1_0_3) + v2_o = T.axis.spatial(2, ax2_0_4 + ax2_0_3) + v3_o = T.axis.reduce(18, ax3_0_0 * 6 + ax3_0_1 * 3 + ax3_0_2) + T.reads(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 : v1_o * 16 + 16, v3_o * 16 : v3_o * 16 + 16], weight_reindex_shared_wmma_matrix_b[v3_o * 16 : v3_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16]) + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 : v1_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax1_1, ax2_1 in T.grid(16, 16): + with T.block("conv2d_nhwc_init"): + v1_i_init, v2_i_init = T.axis.remap("SS", [ax1_1, ax2_1]) + T.reads() + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init]) + 
conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i_init, v2_o * 16 + v2_i_init] = T.float32(0) + for ax1_1, ax2_1, ax3_1 in T.grid(16, 16, 16): + with T.block("conv2d_nhwc"): + v1_i, v2_i, v3_i = T.axis.remap("SSR", [ax1_1, ax2_1, ax3_1]) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i], PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i]) + T.writes(conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v1_o * 16 + v1_i, v2_o * 16 + v2_i] + T.cast(PadInput_reindex_shared_wmma_matrix_a[v1_o * 16 + v1_i, v3_o * 16 + v3_i], "float32") * T.cast(weight_reindex_shared_wmma_matrix_b[v3_o * 16 + v3_i, v2_o * 16 + v2_i], "float32") + for ax0_0, ax1_0 in T.grid(1, 2): + with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator_o"): + v0_o = T.axis.spatial(16, ax0_0_ax1_0_0_ax2_0_0_fused * 4 + ax0_1_ax1_0_1_ax2_0_1_fused) + v1_o = T.axis.spatial(2, ax1_0) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("conv2d_nhwc_reindex_shared_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + conv2d_nhwc_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = conv2d_nhwc_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in 
T.grid(16, 32): + with T.block("conv2d_nhwc_reindex_shared"): + v0 = T.axis.spatial(256, ax0_0_ax1_0_0_ax2_0_0_fused * 64 + ax0_1_ax1_0_1_ax2_0_1_fused * 16 + ax0) + v1 = T.axis.spatial(32, ax1) + T.reads(conv2d_nhwc_reindex_shared[v0, v1]) + T.writes(conv2d_nhwc[0, v0 // 16, v0 % 16, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + conv2d_nhwc[0, v0 // 16, v0 % 16, v1] = conv2d_nhwc_reindex_shared[v0, v1] + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1, 1, 1]), + ("SamplePerfectTile", [4, 4, 1, 1, 1]), + ("SamplePerfectTile", [1, 1, 1, 2, 1]), + ("SamplePerfectTile", [3, 2, 3]), + ("SampleCategorical", 2), + ("SampleCategorical", 3), + ("SampleCategorical", 3), + ] + + mod = te.create_prim_func( + te_workload.conv2d_nhwc( + N=1, + H=16, + W=16, + CI=32, + CO=32, + kernel_size=3, + stride=1, + padding=1, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + multi_level_tiling_tensor_core( + in_dtype="float16", + out_dtype=["float16", "float32"], + ), + ], + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[conv2d_more_intrin_0], + expected_decisions=[decision_0], + ) + + +def test_matmul_relu_pipeline(): + # fmt: off + @T.prim_func + def matmul_relu_pipeline_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C = T.alloc_buffer([128, 128], dtype="float32") + C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator") + A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + B_reindex_shared = T.alloc_buffer([128, 128], 
dtype="float16", scope="shared") + A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a") + B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(1, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(16, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(1, thread="threadIdx.y"): + for ax2_0_0 in T.serial(4, annotations={"software_pipeline_order":[0, 3, 1, 4, 5, 2, 6], "software_pipeline_stage":[0, 0, 0, 0, 0, 1, 1]}): + for ax0_ax1_fused in T.serial(1024): + with T.block("A_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0_ax1_fused // 32) + v1 = T.axis.spatial(128, ax2_0_0 * 32 + ax0_ax1_fused % 32) + T.reads(A[v0, v1]) + T.writes(A_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "double_buffer_scope":0, "meta_schedule.cooperative_fetch":4, "tir.manifest_shared_memory_local_stage":1}) + A_reindex_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(1024): + with T.block("B_reindex_shared"): + v0 = T.axis.spatial(128, ax2_0_0 * 32 + ax0_ax1_fused // 32) + v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax0_ax1_fused % 32) + T.reads(B[v0, v1]) + T.writes(B_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "double_buffer_scope":0, "meta_schedule.cooperative_fetch":2, "tir.manifest_shared_memory_local_stage":1}) + B_reindex_shared[v0, v1] = B[v0, v1] + for ax2_0_1 in T.serial(2, annotations={"software_pipeline_order":[0, 1, 2], "software_pipeline_stage":[0, 0, 1]}): + for ax0_0, ax1_0 in T.grid(2, 1): + with T.block("A_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1) + T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + 
T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("A_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(1, 2): + with T.block("B_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(8, ax2_0_0 * 2 + ax2_0_1) + v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0) + T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("B_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 1, 2, 2): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0_3 * 2 + ax0_0_4) + v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0_3 * 2 + ax1_0_4) + v2_o = T.axis.reduce(8, ax2_0_0 * 2 + ax2_0_1 + ax2_0_2) + T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + 
T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0_0, ax1_0 in T.grid(2, 2): + with T.block("C_reindex_shared_wmma.accumulator_o"): + v0_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused // 4 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_1_ax1_0_1_fused % 4 * 2 + ax1_0) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with 
T.block("C_reindex_shared_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in T.grid(32, 32): + with T.block("C_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused // 4 * 32 + ax0) + v1 = T.axis.spatial(128, ax0_0_1_ax1_0_1_fused % 4 * 32 + ax1) + T.reads(C_reindex_shared[v0, v1]) + T.writes(C[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":3}) + C[v0, v1] = C_reindex_shared[v0, v1] + for i0, i1 in T.grid(128, 128): + with T.block("compute"): + i0_1, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(C[i0_1, i1_1]) + T.writes(compute[i0_1, i1_1]) + compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0)) + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 4, 1, 1, 2]), + ("SamplePerfectTile", [1, 4, 1, 1, 2]), + ("SamplePerfectTile", [4, 2, 1]), + ("SampleCategorical", 2), + ("SampleCategorical", 2), + ("SampleCategorical", 1), + ] + mod = te.create_prim_func( + te_workload.matmul_relu( + n=128, + m=128, + k=128, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + multi_level_tiling_tensor_core( + use_software_pipeline=True, + ), + ], + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[matmul_relu_pipeline_0], + expected_decisions=[decision_0], + ) + + +def test_matmul_relu_global(): + # fmt: off + @T.prim_func + def matmul_relu_global_0(A: T.Buffer[(128, 128), "float16"], B: T.Buffer[(128, 128), "float16"], compute: T.Buffer[(128, 128), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # 
with T.block("root") + C = T.alloc_buffer([128, 128], dtype="float32") + C_reindex_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator") + A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a") + B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(1, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(1, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(16, thread="threadIdx.y"): + for ax2_0_0 in T.serial(2): + for ax0_ax1_fused in T.serial(8192): + with T.block("A_reindex_shared"): + v0 = T.axis.spatial(128, ax0_ax1_fused // 64) + v1 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused % 64) + T.reads(A[v0, v1]) + T.writes(A_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1}) + A_reindex_shared[v0, v1] = A[v0, v1] + for ax0_ax1_fused in T.serial(8192): + with T.block("B_reindex_shared"): + v0 = T.axis.spatial(128, ax2_0_0 * 64 + ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(B[v0, v1]) + T.writes(B_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1}) + B_reindex_shared[v0, v1] = B[v0, v1] + for ax2_0_1 in T.serial(2): + for ax0_0, ax1_0 in T.grid(1, 2): + with T.block("A_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2) + v1_o = T.axis.spatial(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax1_0) + T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + 
T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("A_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(2, 4): + with T.block("B_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0) + T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("B_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 4, 2, 1, 1): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2 + ax0_0_3 + ax0_0_4) + v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0_3) + v2_o = T.axis.reduce(8, ax2_0_0 * 4 + ax2_0_1 * 2 + ax2_0_2) + T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + 
T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0_0, ax1_0 in T.grid(1, 4): + with T.block("C_reindex_wmma.accumulator_o"): + v0_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused // 2) + v1_o = T.axis.spatial(8, ax0_0_2_ax1_0_2_fused % 2 * 4 + ax1_0) + T.reads(C_reindex_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_global"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_reindex_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(C[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + C[v0_o * 16 + 
v0_i, v1_o * 16 + v1_i] = C_reindex_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for i0, i1 in T.grid(128, 128): + with T.block("compute"): + i0_1, i1_1 = T.axis.remap("SS", [i0, i1]) + T.reads(C[i0_1, i1_1]) + T.writes(compute[i0_1, i1_1]) + compute[i0_1, i1_1] = T.max(C[i0_1, i1_1], T.float32(0)) + # fmt: on + decision_0 = [ + ("SamplePerfectTile", [1, 1, 8, 1, 1]), + ("SamplePerfectTile", [1, 1, 2, 4, 1]), + ("SamplePerfectTile", [2, 2, 2]), + ("SampleCategorical", 0), + ("SampleCategorical", 0), + ] + mod = te.create_prim_func( + te_workload.matmul_relu( + n=128, + m=128, + k=128, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")] + + get_rules("cuda", ms.schedule_rule.AutoInline), + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[matmul_relu_global_0], + expected_decisions=[decision_0], + ) + + +def test_matmul_relu_non_tensorizable(): + # expected to do nothing on non-tensorizable workloads + mod = te.create_prim_func( + te_workload.matmul_relu( # dtype doesn't match tensor intrin + n=128, + m=128, + k=128, + ) + ) + (sch,) = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="global")] + + get_rules("cuda", ms.schedule_rule.AutoInline), + ).generate_design_space() + tvm.ir.assert_structural_equal(mod, sch.mod["main"]) + + +if __name__ == "__main__": + test_matmul_relu() + test_matmul_relu_with_fallback() + test_conv2d() + test_conv2d_more_intrin() + test_matmul_relu_pipeline() + test_matmul_relu_global() + test_matmul_relu_non_tensorizable() diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py 
b/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py deleted file mode 100644 index fe1220c50925..000000000000 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_multi_level_tiling.py +++ /dev/null @@ -1,1205 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring -import tvm -import tvm.testing -from tvm import te -from tvm.meta_schedule import schedule_rule -from tvm.meta_schedule.space_generator.post_order_apply import PostOrderApply -from tvm.meta_schedule.testing import te_workload -from tvm.meta_schedule.testing.schedule_rule import ( - auto_inline, - multi_level_tiling, - multi_level_tiling_tensor_core, -) -from tvm.meta_schedule.testing.space_generation import check_trace -from tvm.meta_schedule.tune_context import TuneContext -from tvm.script import tir as T -from tvm.target import Target -from tvm.te import create_prim_func -from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN - - -def _create_context(mod, target, rule) -> TuneContext: - if not isinstance(rule, (list, tuple)): - rule = [rule] - ctx = TuneContext( - mod=mod, - target=target, - space_generator=PostOrderApply(), - sch_rules=rule, - task_name="test", - ) - return ctx - - -def test_cpu_matmul(): - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - 'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")', - 
"sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)", - ], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - 'b24 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="global")', - "sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)", - ], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - ], - ] - target = Target("llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul( - n=512, - m=512, - k=512, - ) - ), - target=target, - 
rule=multi_level_tiling(target=target), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 3 - check_trace(spaces, expected) - - -def test_cpu_matmul_relu(): - # pylint: disable=line-too-long - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - "b24, = sch.get_consumers(block=b0)", - "sch.reverse_compute_at(block=b24, loop=l17, preserve_unit_loops=True, index=-1)", - ], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - "b24, = sch.get_consumers(block=b0)", - 
"sch.reverse_compute_at(block=b24, loop=l16, preserve_unit_loops=True, index=-1)", - ], - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7 = sch.sample_perfect_tile(loop=l1, n=4, max_innermost_factor=64)", - "l8, l9, l10, l11 = sch.split(loop=l1, factors=[v4, v5, v6, v7], preserve_unit_iters=True)", - "v12, v13, v14, v15 = sch.sample_perfect_tile(loop=l2, n=4, max_innermost_factor=64)", - "l16, l17, l18, l19 = sch.split(loop=l2, factors=[v12, v13, v14, v15], preserve_unit_iters=True)", - "v20, v21 = sch.sample_perfect_tile(loop=l3, n=2, max_innermost_factor=64)", - "l22, l23 = sch.split(loop=l3, factors=[v20, v21], preserve_unit_iters=True)", - "sch.reorder(l8, l16, l9, l17, l22, l10, l18, l23, l11, l19)", - ], - ] - # pylint: enable=line-too-long - target = Target("llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul_relu( - n=512, - m=512, - k=512, - ) - ), - target=target, - rule=multi_level_tiling(target=target), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 3 - check_trace(spaces, expected) - - -def test_cuda_matmul(): - # pylint: disable=line-too-long - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7, v8 = sch.sample_perfect_tile(loop=l1, n=5, max_innermost_factor=64)", - "l9, l10, l11, l12, l13 = sch.split(loop=l1, factors=[v4, v5, v6, v7, v8], preserve_unit_iters=True)", - "v14, v15, v16, v17, v18 = sch.sample_perfect_tile(loop=l2, n=5, max_innermost_factor=64)", - "l19, l20, l21, l22, l23 = sch.split(loop=l2, factors=[v14, v15, v16, v17, v18], preserve_unit_iters=True)", - "v24, v25, v26 = sch.sample_perfect_tile(loop=l3, n=3, max_innermost_factor=64)", - 
"l27, l28, l29 = sch.split(loop=l3, factors=[v24, v25, v26], preserve_unit_iters=True)", - "sch.reorder(l9, l19, l10, l20, l11, l21, l27, l28, l12, l22, l29, l13, l23)", - "l30 = sch.fuse(l9, l19, preserve_unit_iters=True)", - 'sch.bind(loop=l30, thread_axis="blockIdx.x")', - "l31 = sch.fuse(l10, l20, preserve_unit_iters=True)", - 'sch.bind(loop=l31, thread_axis="vthread.x")', - "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)", - 'sch.bind(loop=l32, thread_axis="threadIdx.x")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_low_inclusive", ann_val=32)', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.thread_extent_high_inclusive", ann_val=1024)', - 'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")', - "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)", - 'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")', - "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)", - "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)", - "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)", - "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", - 'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)', - 'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")', - "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)", - "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)", - "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)", - "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", - 'sch.annotate(block_or_loop=b43, ann_key="meta_schedule.cooperative_fetch", ann_val=v51)', - ] - ] - # pylint: enable=line-too-long - target = Target("cuda --max_threads_per_block=1024 --thread_warp_size=32", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul( - n=512, - m=512, - 
k=512, - ) - ), - target=target, - rule=multi_level_tiling(target=target), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) - - -def test_cuda_matmul_relu(): - # pylint: disable=line-too-long - expected = [ - [ - 'b0 = sch.get_block(name="C", func_name="main")', - 'sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS")', - "l1, l2, l3 = sch.get_loops(block=b0)", - "v4, v5, v6, v7, v8 = sch.sample_perfect_tile(loop=l1, n=5, max_innermost_factor=64)", - "l9, l10, l11, l12, l13 = sch.split(loop=l1, factors=[v4, v5, v6, v7, v8], preserve_unit_iters=True)", - "v14, v15, v16, v17, v18 = sch.sample_perfect_tile(loop=l2, n=5, max_innermost_factor=64)", - "l19, l20, l21, l22, l23 = sch.split(loop=l2, factors=[v14, v15, v16, v17, v18], preserve_unit_iters=True)", - "v24, v25, v26 = sch.sample_perfect_tile(loop=l3, n=3, max_innermost_factor=64)", - "l27, l28, l29 = sch.split(loop=l3, factors=[v24, v25, v26], preserve_unit_iters=True)", - "sch.reorder(l9, l19, l10, l20, l11, l21, l27, l28, l12, l22, l29, l13, l23)", - "l30 = sch.fuse(l9, l19, preserve_unit_iters=True)", - 'sch.bind(loop=l30, thread_axis="blockIdx.x")', - "l31 = sch.fuse(l10, l20, preserve_unit_iters=True)", - 'sch.bind(loop=l31, thread_axis="vthread.x")', - "l32 = sch.fuse(l11, l21, preserve_unit_iters=True)", - 'sch.bind(loop=l32, thread_axis="threadIdx.x")', - 'b33 = sch.cache_write(block=b0, write_buffer_index=0, storage_scope="local")', - "sch.reverse_compute_at(block=b33, loop=l32, preserve_unit_loops=True, index=-1)", - 'b34 = sch.cache_read(block=b0, read_buffer_index=0, storage_scope="shared")', - "sch.compute_at(block=b34, loop=l27, preserve_unit_loops=True, index=-1)", - "l35, l36, l37, l38, l39, l40 = sch.get_loops(block=b34)", - "l41 = sch.fuse(l39, l40, preserve_unit_iters=True)", - "v42 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", - 
'sch.annotate(block_or_loop=b34, ann_key="meta_schedule.cooperative_fetch", ann_val=v42)', - 'b43 = sch.cache_read(block=b0, read_buffer_index=1, storage_scope="shared")', - "sch.compute_at(block=b43, loop=l27, preserve_unit_loops=True, index=-1)", - "l44, l45, l46, l47, l48, l49 = sch.get_loops(block=b43)", - "l50 = sch.fuse(l48, l49, preserve_unit_iters=True)", - "v51 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25])", - 'sch.annotate(block_or_loop=b43, ann_key="meta_schedule.cooperative_fetch", ann_val=v51)', - ] - ] - # pylint: enable=line-too-long - target = Target("cuda", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul_relu( - n=512, - m=512, - k=512, - ) - ), - target=target, - rule=multi_level_tiling(target=target), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) - - -def test_cuda_sum_with_trivial_block_iter(): - @T.prim_func - def sum_with_trivial_block_iter( - A: T.Buffer[(1, 64, 768), "float32"], B: T.Buffer[(1, 64, 1), "float32"] - ) -> None: - for i0, i1, i2, i3 in T.grid(1, 64, 1, 768): - with T.block("sum"): - ax0, ax1, ax2, k2 = T.axis.remap("SSSR", [i0, i1, i2, i3]) - T.reads(A[ax0, ax1, k2]) - T.writes(B[ax0, ax1, ax2]) - with T.init(): - B[ax0, ax1, ax2] = T.float32(0) - B[ax0, ax1, ax2] = B[ax0, ax1, ax2] + A[ax0, ax1, k2] - - # Expect nothing to happen - the rule is not supposed to be applied in this case - expected = [[]] - target = Target("cuda", host="llvm") - ctx = _create_context( - sum_with_trivial_block_iter, - target=target, - rule=multi_level_tiling(target=target), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) - - -@tvm.script.ir_module -class Conv2dNCHWcVNNIModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - 
conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - -def test_multi_level_tiling_conv2d_nchwc_vnni(): - target = "llvm -mcpu=cascadelake -num-cores 4" - ctx = _create_context( - Conv2dNCHWcVNNIModule, - target=tvm.target.Target(target), - rule=schedule_rule.MultiLevelTilingWithIntrin( - VNNI_INTRIN, - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=schedule_rule.ReuseType( - req="may", - levels=[1, 2], - scope="global", - ), - ), - ) - - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - - expected = [ - """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") -l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0) -l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True) -l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l15, l16, l17, l18, 
l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0) -sch.reorder(l21, l22, l23, l24, l25, l14, l12) -b27 = sch.blockize(loop=l14) -sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni") -l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27) -v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64) -l42, l43, l44, l45 = sch.split(loop=l28, factors=[v38, v39, v40, v41], preserve_unit_iters=True) -v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64) -l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True) -v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64) -l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True) -v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64) -l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True) -v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64) -l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], preserve_unit_iters=True) -v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64) -l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True) -v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64) -l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True) -v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64) -l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True) -v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64) -l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True) -v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64) -l96, l97 = sch.split(loop=l37, factors=[v94, v95], 
preserve_unit_iters=True) -sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77) -b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global") -sch.reverse_compute_at(block=b98, loop=l75, preserve_unit_loops=True, index=-1)""".split( - "\n" - ), - """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") -l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0) -l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True) -l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0) -sch.reorder(l21, l22, l23, l24, l25, l14, l12) -b27 = sch.blockize(loop=l14) -sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni") -l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27) -v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64) -l42, l43, l44, l45 = sch.split(loop=l28, factors=[v38, v39, v40, v41], preserve_unit_iters=True) -v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64) -l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True) -v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64) -l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True) -v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64) -l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True) -v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64) -l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], 
preserve_unit_iters=True) -v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64) -l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True) -v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64) -l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True) -v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64) -l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True) -v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64) -l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True) -v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64) -l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True) -sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77) -b98 = sch.cache_write(block=b27, write_buffer_index=0, storage_scope="global") -sch.reverse_compute_at(block=b98, loop=l74, preserve_unit_loops=True, index=-1)""".split( - "\n" - ), - """b0 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") -l1, l2, l3, l4, l5, l6, l7, l8, l9, l10 = sch.get_loops(block=b0) -l11, l12 = sch.split(loop=l10, factors=[None, 4], preserve_unit_iters=True) -l13, l14 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l15, l16, l17, l18, l19, l20, l21, l22, l23, l24, l25, l26 = sch.get_loops(block=b0) -sch.reorder(l21, l22, l23, l24, l25, l14, l12) -b27 = sch.blockize(loop=l14) -sch.annotate(block_or_loop=b27, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni") -l28, l29, l30, l31, l32, l33, l34, l35, l36, l37 = sch.get_loops(block=b27) -v38, v39, v40, v41 = sch.sample_perfect_tile(loop=l28, n=4, max_innermost_factor=64) -l42, l43, l44, l45 = sch.split(loop=l28, 
factors=[v38, v39, v40, v41], preserve_unit_iters=True) -v46, v47, v48, v49 = sch.sample_perfect_tile(loop=l29, n=4, max_innermost_factor=64) -l50, l51, l52, l53 = sch.split(loop=l29, factors=[v46, v47, v48, v49], preserve_unit_iters=True) -v54, v55, v56, v57 = sch.sample_perfect_tile(loop=l30, n=4, max_innermost_factor=64) -l58, l59, l60, l61 = sch.split(loop=l30, factors=[v54, v55, v56, v57], preserve_unit_iters=True) -v62, v63, v64, v65 = sch.sample_perfect_tile(loop=l31, n=4, max_innermost_factor=64) -l66, l67, l68, l69 = sch.split(loop=l31, factors=[v62, v63, v64, v65], preserve_unit_iters=True) -v70, v71, v72, v73 = sch.sample_perfect_tile(loop=l32, n=4, max_innermost_factor=64) -l74, l75, l76, l77 = sch.split(loop=l32, factors=[v70, v71, v72, v73], preserve_unit_iters=True) -v78, v79 = sch.sample_perfect_tile(loop=l33, n=2, max_innermost_factor=64) -l80, l81 = sch.split(loop=l33, factors=[v78, v79], preserve_unit_iters=True) -v82, v83 = sch.sample_perfect_tile(loop=l34, n=2, max_innermost_factor=64) -l84, l85 = sch.split(loop=l34, factors=[v82, v83], preserve_unit_iters=True) -v86, v87 = sch.sample_perfect_tile(loop=l35, n=2, max_innermost_factor=64) -l88, l89 = sch.split(loop=l35, factors=[v86, v87], preserve_unit_iters=True) -v90, v91 = sch.sample_perfect_tile(loop=l36, n=2, max_innermost_factor=64) -l92, l93 = sch.split(loop=l36, factors=[v90, v91], preserve_unit_iters=True) -v94, v95 = sch.sample_perfect_tile(loop=l37, n=2, max_innermost_factor=64) -l96, l97 = sch.split(loop=l37, factors=[v94, v95], preserve_unit_iters=True) -sch.reorder(l42, l50, l58, l66, l74, l43, l51, l59, l67, l75, l80, l84, l88, l92, l96, l44, l52, l60, l68, l76, l81, l85, l89, l93, l97, l45, l53, l61, l69, l77)""".split( - "\n" - ), - ] - - check_trace(spaces, expected) - - -def _test_multi_level_tiling_dense_dp4a(m, n, k, in_dtype, out_dtype, expected): - X = te.placeholder((m, k), name="X", dtype=in_dtype) - W = te.placeholder((n, k), name="W", dtype=in_dtype) - ak = 
te.reduce_axis((0, k), name="k") - - matmul = te.compute( - (m, n), - lambda i, j: te.sum( - X[i, ak].astype(out_dtype) * W[j, ak].astype(out_dtype), - axis=ak, - ), - name="compute", - ) - - func = te.create_prim_func([X, W, matmul]) - - ctx = _create_context( - func, - target=tvm.target.Target("cuda"), - rule=schedule_rule.MultiLevelTilingWithIntrin( - DP4A_INTRIN, - structure="SSSRRSRS", - tile_binds=["blockIdx.x", "vthread.x", "threadIdx.x"], - max_innermost_factor=64, - vector_load_lens=[1, 2, 3, 4], - reuse_read=schedule_rule.ReuseType( - req="must", - levels=[4], - scope="shared", - ), - reuse_write=schedule_rule.ReuseType( - req="must", - levels=[3], - scope="local", - ), - ), - ) - - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - check_trace(spaces, expected) - - -def test_multi_level_tiling_dense_dp4a(): - m, n, k = 128, 128, 128 - - expected = [ - """b0 = sch.get_block(name="compute", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -l1, l2, l3 = sch.get_loops(block=b0) -l4, l5 = sch.split(loop=l3, factors=[None, 4], preserve_unit_iters=True) -sch.reorder(l5) -b6 = sch.blockize(loop=l5) -sch.annotate(block_or_loop=b6, ann_key="meta_schedule.auto_tensorize", ann_val="dp4a") -l7, l8, l9 = sch.get_loops(block=b6) -v10, v11, v12, v13, v14 = sch.sample_perfect_tile(loop=l7, n=5, max_innermost_factor=64) -l15, l16, l17, l18, l19 = sch.split(loop=l7, factors=[v10, v11, v12, v13, v14], preserve_unit_iters=True) -v20, v21, v22, v23, v24 = sch.sample_perfect_tile(loop=l8, n=5, max_innermost_factor=64) -l25, l26, l27, l28, l29 = sch.split(loop=l8, factors=[v20, v21, v22, v23, v24], preserve_unit_iters=True) -v30, v31, v32 = sch.sample_perfect_tile(loop=l9, n=3, max_innermost_factor=64) -l33, l34, l35 = sch.split(loop=l9, factors=[v30, v31, v32], preserve_unit_iters=True) -sch.reorder(l15, l25, l16, l26, l17, l27, l33, l34, l18, l28, l35, l19, l29) -l36 = sch.fuse(l15, l25, 
preserve_unit_iters=True) -sch.bind(loop=l36, thread_axis="blockIdx.x") -l37 = sch.fuse(l16, l26, preserve_unit_iters=True) -sch.bind(loop=l37, thread_axis="vthread.x") -l38 = sch.fuse(l17, l27, preserve_unit_iters=True) -sch.bind(loop=l38, thread_axis="threadIdx.x") -b39 = sch.cache_write(block=b6, write_buffer_index=0, storage_scope="local") -sch.reverse_compute_at(block=b39, loop=l38, preserve_unit_loops=True, index=-1) -b40 = sch.cache_read(block=b6, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b40, loop=l33, preserve_unit_loops=True, index=-1) -l41, l42, l43, l44, l45, l46 = sch.get_loops(block=b40) -l47 = sch.fuse(l45, l46, preserve_unit_iters=True) -v48 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b40, ann_key="meta_schedule.cooperative_fetch", ann_val=v48) -b49 = sch.cache_read(block=b6, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b49, loop=l33, preserve_unit_loops=True, index=-1) -l50, l51, l52, l53, l54, l55 = sch.get_loops(block=b49) -l56 = sch.fuse(l54, l55, preserve_unit_iters=True) -v57 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b49, ann_key="meta_schedule.cooperative_fetch", ann_val=v57)""".split( - "\n" - ) - ] - - _test_multi_level_tiling_dense_dp4a(m, n, k, "int8", "int32", expected) - - -def test_multi_level_tiling_dense_dp4a_non_tensorizable(): - _test_multi_level_tiling_dense_dp4a(128, 128, 128, "float32", "float32", [""]) - _test_multi_level_tiling_dense_dp4a(127, 127, 127, "int8", "int32", [""]) - - -def test_cuda_tensor_core_matmul_relu(): - m = n = k = 128 - target = Target("cuda", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul_relu( - n=n, - m=m, - k=k, - in_dtype="float16", - out_dtype="float32", - ) - ), - target=target, - rule=[ - multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"), - auto_inline(target), - ], 
- ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - expected = [ - """b0 = sch.get_block(name="C", func_name="main") -b1 = sch.get_block(name="compute", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -b2 = sch.reindex(block=b0, buffer=("write", 0)) -b3 = sch.reindex(block=b0, buffer=("read", 0)) -b4 = sch.reindex(block=b0, buffer=("read", 1)) -sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, )) -sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, )) -sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, )) -sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, )) -l5, l6, l7 = sch.get_loops(block=b0) -l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True) -l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) -l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) -sch.reorder(l16, l18, l13, l11, l9) -b20 = sch.blockize(loop=l13) -sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32") -sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") -sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1) -l21, l22, l23 = sch.get_loops(block=b20) -v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4) -l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True) -v34, v35, v36, v37, v38 = 
sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4) -l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True) -v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4) -l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True) -sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43) -l50 = sch.fuse(l29, l39, preserve_unit_iters=True) -sch.bind(loop=l50, thread_axis="blockIdx.y") -l51 = sch.fuse(l30, l40, preserve_unit_iters=True) -sch.bind(loop=l51, thread_axis="blockIdx.x") -l52 = sch.fuse(l31, l41, preserve_unit_iters=True) -sch.bind(loop=l52, thread_axis="threadIdx.y") -b53 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="shared") -sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1) -b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1) -v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55) -sch.reverse_compute_inline(block=b2) -l56, l57, l58, l59, l60 = sch.get_loops(block=b54) -l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True) -l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True) -l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54) -sch.reorder(l70, l64, l62) -b72 = sch.blockize(loop=l64) -sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") -b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1) -l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73) -l80 = sch.fuse(l78, l79, preserve_unit_iters=True) -v81 = 
sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81) -b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1) -l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82) -l89 = sch.fuse(l87, l88, preserve_unit_iters=True) -v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90) -b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1) -l92, l93, l94, l95, l96, l97, l98 = sch.get_loops(block=b91) -l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True) -l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True) -l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91) -sch.reorder(l110, l102, l100) -b112 = sch.blockize(loop=l102) -sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") -b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1) -l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113) -l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True) -l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True) -l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113) -sch.reorder(l132, l124, l122) -b134 = sch.blockize(loop=l124) -sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b") -sch.compute_inline(block=b3) -sch.compute_inline(block=b4) -sch.storage_align(block=b73, buffer_index=0, axis=-2, 
factor=32, offset=8) -sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8) -sch.reverse_compute_inline(block=b1)""".split( - "\n" - ) - ] - check_trace(spaces, expected) - - # test multi_level_tiling_tensor_core and multi_level_tiling can be used together in order - # to use multi_level_tiling as a fallback when the workload can't be tensorized - ctx = _create_context( - create_prim_func( - te_workload.matmul_relu( - n=n, - m=m, - k=k, - in_dtype="float16", - out_dtype="float32", - ) - ), - target=target, - rule=[ - multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"), - multi_level_tiling(target=target), - auto_inline(target), - ], - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - check_trace(spaces, expected) - - -def test_cuda_tensor_core_software_pipeline_matmul_relu(): - m = n = k = 128 - target = Target("cuda", host="llvm") - ctx = _create_context( - create_prim_func( - te_workload.matmul_relu( - n=n, - m=m, - k=k, - in_dtype="float16", - out_dtype="float32", - ) - ), - target=target, - rule=[ - multi_level_tiling_tensor_core( - target=target, write_reuse_scope="shared", use_software_pipeline=True - ), - auto_inline(target), - ], - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - expected = [ - """b0 = sch.get_block(name="C", func_name="main") -b1 = sch.get_block(name="compute", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -b2 = sch.reindex(block=b0, buffer=("write", 0)) -b3 = sch.reindex(block=b0, buffer=("read", 0)) -b4 = sch.reindex(block=b0, buffer=("read", 1)) -sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, )) -sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, )) -sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, )) -sch.transform_block_layout(block=b2, 
index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b4, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, )) -l5, l6, l7 = sch.get_loops(block=b0) -l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True) -l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) -l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l14, l15, l16, l17, l18, l19 = sch.get_loops(block=b0) -sch.reorder(l16, l18, l13, l11, l9) -b20 = sch.blockize(loop=l13) -sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32") -sch.annotate(block_or_loop=b20, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") -sch.annotate(block_or_loop=b20, ann_key="warp_execution", ann_val=1) -l21, l22, l23 = sch.get_loops(block=b20) -v24, v25, v26, v27, v28 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4) -l29, l30, l31, l32, l33 = sch.split(loop=l21, factors=[v24, v25, v26, v27, v28], preserve_unit_iters=True) -v34, v35, v36, v37, v38 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4) -l39, l40, l41, l42, l43 = sch.split(loop=l22, factors=[v34, v35, v36, v37, v38], preserve_unit_iters=True) -v44, v45, v46 = sch.sample_perfect_tile(loop=l23, n=3, max_innermost_factor=4) -l47, l48, l49 = sch.split(loop=l23, factors=[v44, v45, v46], preserve_unit_iters=True) -sch.reorder(l29, l39, l30, l40, l31, l41, l47, l48, l32, l42, l49, l33, l43) -l50 = sch.fuse(l29, l39, preserve_unit_iters=True) -sch.bind(loop=l50, thread_axis="blockIdx.y") -l51 = sch.fuse(l30, l40, preserve_unit_iters=True) -sch.bind(loop=l51, thread_axis="blockIdx.x") -l52 = sch.fuse(l31, l41, preserve_unit_iters=True) -sch.bind(loop=l52, thread_axis="threadIdx.y") -b53 = sch.cache_write(block=b20, write_buffer_index=0, 
storage_scope="shared") -sch.reverse_compute_at(block=b53, loop=l51, preserve_unit_loops=True, index=-1) -b54 = sch.cache_write(block=b20, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b54, loop=l52, preserve_unit_loops=True, index=-1) -v55 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b53, ann_key="meta_schedule.cooperative_fetch", ann_val=v55) -sch.reverse_compute_inline(block=b2) -l56, l57, l58, l59, l60 = sch.get_loops(block=b54) -l61, l62 = sch.split(loop=l60, factors=[None, 16], preserve_unit_iters=True) -l63, l64 = sch.split(loop=l59, factors=[None, 16], preserve_unit_iters=True) -l65, l66, l67, l68, l69, l70, l71 = sch.get_loops(block=b54) -sch.reorder(l70, l64, l62) -b72 = sch.blockize(loop=l64) -sch.annotate(block_or_loop=b72, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") -b73 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b73, loop=l47, preserve_unit_loops=True, index=-1) -l74, l75, l76, l77, l78, l79 = sch.get_loops(block=b73) -l80 = sch.fuse(l78, l79, preserve_unit_iters=True) -v81 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b73, ann_key="meta_schedule.cooperative_fetch", ann_val=v81) -b82 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b82, loop=l47, preserve_unit_loops=True, index=-1) -l83, l84, l85, l86, l87, l88 = sch.get_loops(block=b82) -l89 = sch.fuse(l87, l88, preserve_unit_iters=True) -v90 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b82, ann_key="meta_schedule.cooperative_fetch", ann_val=v90) -b91 = sch.cache_read(block=b20, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b91, loop=l48, preserve_unit_loops=True, index=-1) -l92, l93, l94, l95, l96, l97, l98 = 
sch.get_loops(block=b91) -l99, l100 = sch.split(loop=l98, factors=[None, 16], preserve_unit_iters=True) -l101, l102 = sch.split(loop=l97, factors=[None, 16], preserve_unit_iters=True) -l103, l104, l105, l106, l107, l108, l109, l110, l111 = sch.get_loops(block=b91) -sch.reorder(l110, l102, l100) -b112 = sch.blockize(loop=l102) -sch.annotate(block_or_loop=b112, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") -b113 = sch.cache_read(block=b20, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b113, loop=l48, preserve_unit_loops=True, index=-1) -l114, l115, l116, l117, l118, l119, l120 = sch.get_loops(block=b113) -l121, l122 = sch.split(loop=l120, factors=[None, 16], preserve_unit_iters=True) -l123, l124 = sch.split(loop=l119, factors=[None, 16], preserve_unit_iters=True) -l125, l126, l127, l128, l129, l130, l131, l132, l133 = sch.get_loops(block=b113) -sch.reorder(l132, l124, l122) -b134 = sch.blockize(loop=l124) -sch.annotate(block_or_loop=b134, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b") -sch.compute_inline(block=b3) -sch.compute_inline(block=b4) -sch.storage_align(block=b73, buffer_index=0, axis=-2, factor=32, offset=8) -sch.storage_align(block=b82, buffer_index=0, axis=-2, factor=32, offset=8) -sch.annotate(block_or_loop=b73, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1) -sch.annotate(block_or_loop=b73, ann_key="double_buffer_scope", ann_val=0) -sch.annotate(block_or_loop=b82, ann_key="tir.manifest_shared_memory_local_stage", ann_val=1) -sch.annotate(block_or_loop=b82, ann_key="double_buffer_scope", ann_val=0) -sch.annotate(block_or_loop=l48, ann_key="software_pipeline_stage", ann_val=[0, 0, 1]) -sch.annotate(block_or_loop=l48, ann_key="software_pipeline_order", ann_val=[0, 1, 2]) -sch.annotate(block_or_loop=l47, ann_key="software_pipeline_stage", ann_val=[0, 0, 0, 0, 0, 1, 1]) -sch.annotate(block_or_loop=l47, ann_key="software_pipeline_order", ann_val=[0, 3, 1, 4, 5, 
2, 6]) -sch.reverse_compute_inline(block=b1)""".split( - "\n" - ) - ] - check_trace(spaces, expected) - - -def test_cuda_tensor_core_matmul_relu_global(): - m = n = k = 128 - target = Target("cuda", host="llvm") - workload = create_prim_func( - te_workload.matmul_relu( - n=n, - m=m, - k=k, - in_dtype="float16", - out_dtype="float32", - ), - ) - ctx = _create_context( - workload, - target=target, - rule=[ - multi_level_tiling_tensor_core(target=target, write_reuse_scope="global"), - auto_inline(target), - ], - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - expected = [ - """b0 = sch.get_block(name="C", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -b1 = sch.reindex(block=b0, buffer=("write", 0)) -b2 = sch.reindex(block=b0, buffer=("read", 0)) -b3 = sch.reindex(block=b0, buffer=("read", 1)) -sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, )) -sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (k, j, )) -sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, )) -sch.transform_block_layout(block=b1, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, )) -l4, l5, l6 = sch.get_loops(block=b0) -l7, l8 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) -l9, l10 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l11, l12 = sch.split(loop=l4, factors=[None, 16], preserve_unit_iters=True) -l13, l14, l15, l16, l17, l18 = sch.get_loops(block=b0) -sch.reorder(l15, l17, l12, l10, l8) -b19 = sch.blockize(loop=l12) -sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32") 
-sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") -sch.annotate(block_or_loop=b19, ann_key="warp_execution", ann_val=1) -l20, l21, l22 = sch.get_loops(block=b19) -v23, v24, v25, v26, v27 = sch.sample_perfect_tile(loop=l20, n=5, max_innermost_factor=4) -l28, l29, l30, l31, l32 = sch.split(loop=l20, factors=[v23, v24, v25, v26, v27], preserve_unit_iters=True) -v33, v34, v35, v36, v37 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4) -l38, l39, l40, l41, l42 = sch.split(loop=l21, factors=[v33, v34, v35, v36, v37], preserve_unit_iters=True) -v43, v44, v45 = sch.sample_perfect_tile(loop=l22, n=3, max_innermost_factor=4) -l46, l47, l48 = sch.split(loop=l22, factors=[v43, v44, v45], preserve_unit_iters=True) -sch.reorder(l28, l38, l29, l39, l30, l40, l46, l47, l31, l41, l48, l32, l42) -l49 = sch.fuse(l28, l38, preserve_unit_iters=True) -sch.bind(loop=l49, thread_axis="blockIdx.y") -l50 = sch.fuse(l29, l39, preserve_unit_iters=True) -sch.bind(loop=l50, thread_axis="blockIdx.x") -l51 = sch.fuse(l30, l40, preserve_unit_iters=True) -sch.bind(loop=l51, thread_axis="threadIdx.y") -b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1) -sch.reverse_compute_inline(block=b1) -l53, l54, l55, l56, l57 = sch.get_loops(block=b52) -l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True) -l60, l61 = sch.split(loop=l56, factors=[None, 16], preserve_unit_iters=True) -l62, l63, l64, l65, l66, l67, l68 = sch.get_loops(block=b52) -sch.reorder(l67, l61, l59) -b69 = sch.blockize(loop=l61) -sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global") -b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1) -l71, l72, l73, l74, l75, l76 = 
sch.get_loops(block=b70) -l77 = sch.fuse(l75, l76, preserve_unit_iters=True) -v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78) -b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1) -l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79) -l86 = sch.fuse(l84, l85, preserve_unit_iters=True) -v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87) -b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1) -l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88) -l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True) -l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True) -l100, l101, l102, l103, l104, l105, l106, l107, l108 = sch.get_loops(block=b88) -sch.reorder(l107, l99, l97) -b109 = sch.blockize(loop=l99) -sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") -b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1) -l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110) -l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True) -l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True) -l122, l123, l124, l125, l126, l127, l128, l129, l130 = sch.get_loops(block=b110) -sch.reorder(l129, l121, l119) -b131 = sch.blockize(loop=l121) -sch.annotate(block_or_loop=b131, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b") -sch.compute_inline(block=b2) 
-sch.compute_inline(block=b3) -sch.storage_align(block=b70, buffer_index=0, axis=-2, factor=32, offset=8) -sch.storage_align(block=b79, buffer_index=0, axis=-2, factor=32, offset=8)""".split( - "\n" - ) - ] - check_trace(spaces, expected) - - ctx = _create_context( - workload, - target=target, - rule=[ - multi_level_tiling_tensor_core( - target=target, write_reuse_scope="global", trans_b=[False, True] - ), - auto_inline(target), - ], - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 2 - - expected = [ - expected[0], - """b0 = sch.get_block(name="C", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -b1 = sch.reindex(block=b0, buffer=("write", 0)) -b2 = sch.reindex(block=b0, buffer=("read", 0)) -b3 = sch.reindex(block=b0, buffer=("read", 1)) -sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda i, k: (i, k, )) -sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda j, k: (j, k, )) -sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda i, j: (i, j, )) -sch.transform_block_layout(block=b1, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b2, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b3, index_map=lambda i, j, k: (i, j, k, )) -sch.transform_block_layout(block=b0, index_map=lambda i, j, k: (i, j, k, )) -l4, l5, l6 = sch.get_loops(block=b0) -l7, l8 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) -l9, l10 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l11, l12 = sch.split(loop=l4, factors=[None, 16], preserve_unit_iters=True) -l13, l14, l15, l16, l17, l18 = sch.get_loops(block=b0) -sch.reorder(l15, l17, l12, l10, l8) -b19 = sch.blockize(loop=l12) -sch.annotate(block_or_loop=b19, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32_trans") -sch.annotate(block_or_loop=b19, 
ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") -sch.annotate(block_or_loop=b19, ann_key="warp_execution", ann_val=1) -l20, l21, l22 = sch.get_loops(block=b19) -v23, v24, v25, v26, v27 = sch.sample_perfect_tile(loop=l20, n=5, max_innermost_factor=4) -l28, l29, l30, l31, l32 = sch.split(loop=l20, factors=[v23, v24, v25, v26, v27], preserve_unit_iters=True) -v33, v34, v35, v36, v37 = sch.sample_perfect_tile(loop=l21, n=5, max_innermost_factor=4) -l38, l39, l40, l41, l42 = sch.split(loop=l21, factors=[v33, v34, v35, v36, v37], preserve_unit_iters=True) -v43, v44, v45 = sch.sample_perfect_tile(loop=l22, n=3, max_innermost_factor=4) -l46, l47, l48 = sch.split(loop=l22, factors=[v43, v44, v45], preserve_unit_iters=True) -sch.reorder(l28, l38, l29, l39, l30, l40, l46, l47, l31, l41, l48, l32, l42) -l49 = sch.fuse(l28, l38, preserve_unit_iters=True) -sch.bind(loop=l49, thread_axis="blockIdx.y") -l50 = sch.fuse(l29, l39, preserve_unit_iters=True) -sch.bind(loop=l50, thread_axis="blockIdx.x") -l51 = sch.fuse(l30, l40, preserve_unit_iters=True) -sch.bind(loop=l51, thread_axis="threadIdx.y") -b52 = sch.cache_write(block=b19, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b52, loop=l51, preserve_unit_loops=True, index=-1) -sch.reverse_compute_inline(block=b1) -l53, l54, l55, l56, l57 = sch.get_loops(block=b52) -l58, l59 = sch.split(loop=l57, factors=[None, 16], preserve_unit_iters=True) -l60, l61 = sch.split(loop=l56, factors=[None, 16], preserve_unit_iters=True) -l62, l63, l64, l65, l66, l67, l68 = sch.get_loops(block=b52) -sch.reorder(l67, l61, l59) -b69 = sch.blockize(loop=l61) -sch.annotate(block_or_loop=b69, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_global") -b70 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b70, loop=l46, preserve_unit_loops=True, index=-1) -l71, l72, l73, l74, l75, l76 = sch.get_loops(block=b70) -l77 = 
sch.fuse(l75, l76, preserve_unit_iters=True) -v78 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b70, ann_key="meta_schedule.cooperative_fetch", ann_val=v78) -b79 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="shared") -sch.compute_at(block=b79, loop=l46, preserve_unit_loops=True, index=-1) -l80, l81, l82, l83, l84, l85 = sch.get_loops(block=b79) -l86 = sch.fuse(l84, l85, preserve_unit_iters=True) -v87 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b79, ann_key="meta_schedule.cooperative_fetch", ann_val=v87) -b88 = sch.cache_read(block=b19, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b88, loop=l47, preserve_unit_loops=True, index=-1) -l89, l90, l91, l92, l93, l94, l95 = sch.get_loops(block=b88) -l96, l97 = sch.split(loop=l95, factors=[None, 16], preserve_unit_iters=True) -l98, l99 = sch.split(loop=l94, factors=[None, 16], preserve_unit_iters=True) -l100, l101, l102, l103, l104, l105, l106, l107, l108 = sch.get_loops(block=b88) -sch.reorder(l107, l99, l97) -b109 = sch.blockize(loop=l99) -sch.annotate(block_or_loop=b109, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") -b110 = sch.cache_read(block=b19, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b110, loop=l47, preserve_unit_loops=True, index=-1) -l111, l112, l113, l114, l115, l116, l117 = sch.get_loops(block=b110) -l118, l119 = sch.split(loop=l117, factors=[None, 16], preserve_unit_iters=True) -l120, l121 = sch.split(loop=l116, factors=[None, 16], preserve_unit_iters=True) -l122, l123, l124, l125, l126, l127, l128, l129, l130 = sch.get_loops(block=b110) -sch.reorder(l129, l121, l119) -b131 = sch.blockize(loop=l121) -sch.annotate(block_or_loop=b131, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b_trans") -sch.compute_inline(block=b2) -sch.compute_inline(block=b3) 
-sch.storage_align(block=b70, buffer_index=0, axis=-2, factor=32, offset=8) -sch.storage_align(block=b79, buffer_index=0, axis=-2, factor=32, offset=8)""".split( - "\n" - ), - ] - check_trace(spaces, expected) - - -def test_multi_level_tiling_non_tensorizable(): - # expected to do nothing on non-tensorizable workloads - m = n = k = 128 - target = Target("cuda", host="llvm") - ctx = _create_context( - create_prim_func( - # dtype doesn't match tensor intrin - te_workload.matmul_relu( - n=n, - m=m, - k=k, - ) - ), - target=target, - rule=multi_level_tiling_tensor_core(target=target, write_reuse_scope="global"), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - expected = [ - "", # expected to do nothing when the workload can't be tensorized - ] - check_trace(spaces, expected) - - -def test_cuda_tensor_core_conv2d(): - target = Target("cuda", host="llvm") - workload = create_prim_func( - te_workload.conv2d_nhwc( - N=1, - H=16, - W=16, - CI=32, - CO=32, - kernel_size=3, - stride=1, - padding=1, - in_dtype="float16", - out_dtype="float32", - ) - ) - ctx = _create_context( - workload, - target=target, - rule=multi_level_tiling_tensor_core(target=target, write_reuse_scope="shared"), - ) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - expected = [ - """b0 = sch.get_block(name="conv2d_nhwc", func_name="main") -sch.annotate(block_or_loop=b0, ann_key="meta_schedule.tiling_structure", ann_val="SSSRRSRS") -b1 = sch.reindex(block=b0, buffer=("write", 0)) -b2 = sch.reindex(block=b0, buffer=("read", 0)) -b3 = sch.reindex(block=b0, buffer=("read", 1)) -sch.transform_layout(block=b0, buffer=("read", 0), index_map=lambda h, w, rh, rw, rc: (((h*16) + w), (((rh*96) + (rw*32)) + rc), )) -sch.transform_layout(block=b0, buffer=("read", 1), index_map=lambda co, rh, rw, rc: ((((rh*96) + (rw*32)) + rc), co, )) -sch.transform_layout(block=b0, buffer=("write", 0), index_map=lambda h, w, co: 
(((h*16) + w), co, )) -sch.transform_block_layout(block=b1, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), )) -sch.transform_block_layout(block=b2, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), )) -sch.transform_block_layout(block=b3, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), )) -sch.transform_block_layout(block=b0, index_map=lambda n, h, w, co, rh, rw, rc: (n, ((h*16) + w), co, (((rh*96) + (rw*32)) + rc), )) -l4, l5, l6, l7 = sch.get_loops(block=b0) -l8, l9 = sch.split(loop=l7, factors=[None, 16], preserve_unit_iters=True) -l10, l11 = sch.split(loop=l6, factors=[None, 16], preserve_unit_iters=True) -l12, l13 = sch.split(loop=l5, factors=[None, 16], preserve_unit_iters=True) -l14, l15, l16, l17, l18, l19, l20 = sch.get_loops(block=b0) -sch.reorder(l17, l19, l13, l11, l9) -b21 = sch.blockize(loop=l13) -sch.annotate(block_or_loop=b21, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_sync_16x16x16_f16f16f32") -sch.annotate(block_or_loop=b21, ann_key="meta_schedule.auto_tensorize_init", ann_val="wmma_fill_16x16x16_f32") -sch.annotate(block_or_loop=b21, ann_key="warp_execution", ann_val=1) -l22, l23, l24, l25 = sch.get_loops(block=b21) -v26, v27, v28, v29, v30 = sch.sample_perfect_tile(loop=l22, n=5, max_innermost_factor=4) -l31, l32, l33, l34, l35 = sch.split(loop=l22, factors=[v26, v27, v28, v29, v30], preserve_unit_iters=True) -v36, v37, v38, v39, v40 = sch.sample_perfect_tile(loop=l23, n=5, max_innermost_factor=4) -l41, l42, l43, l44, l45 = sch.split(loop=l23, factors=[v36, v37, v38, v39, v40], preserve_unit_iters=True) -v46, v47, v48, v49, v50 = sch.sample_perfect_tile(loop=l24, n=5, max_innermost_factor=4) -l51, l52, l53, l54, l55 = sch.split(loop=l24, factors=[v46, v47, v48, v49, v50], preserve_unit_iters=True) -v56, v57, v58 = sch.sample_perfect_tile(loop=l25, n=3, max_innermost_factor=4) -l59, l60, l61 = 
sch.split(loop=l25, factors=[v56, v57, v58], preserve_unit_iters=True) -sch.reorder(l31, l41, l51, l32, l42, l52, l33, l43, l53, l59, l60, l34, l44, l54, l61, l35, l45, l55) -l62 = sch.fuse(l31, l41, l51, preserve_unit_iters=True) -sch.bind(loop=l62, thread_axis="blockIdx.y") -l63 = sch.fuse(l32, l42, l52, preserve_unit_iters=True) -sch.bind(loop=l63, thread_axis="blockIdx.x") -l64 = sch.fuse(l33, l43, l53, preserve_unit_iters=True) -sch.bind(loop=l64, thread_axis="threadIdx.y") -b65 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="shared") -sch.reverse_compute_at(block=b65, loop=l63, preserve_unit_loops=True, index=-1) -b66 = sch.cache_write(block=b21, write_buffer_index=0, storage_scope="wmma.accumulator") -sch.reverse_compute_at(block=b66, loop=l64, preserve_unit_loops=True, index=-1) -v67 = sch.sample_categorical(candidates=[1, 2, 3, 4], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b65, ann_key="meta_schedule.cooperative_fetch", ann_val=v67) -sch.reverse_compute_inline(block=b1) -l68, l69, l70, l71, l72 = sch.get_loops(block=b66) -l73, l74 = sch.split(loop=l72, factors=[None, 16], preserve_unit_iters=True) -l75, l76 = sch.split(loop=l71, factors=[None, 16], preserve_unit_iters=True) -l77, l78, l79, l80, l81, l82, l83 = sch.get_loops(block=b66) -sch.reorder(l82, l76, l74) -b84 = sch.blockize(loop=l76) -sch.annotate(block_or_loop=b84, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_store_16x16x16_f32_shared") -b85 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="shared") -sch.compute_at(block=b85, loop=l59, preserve_unit_loops=True, index=-1) -l86, l87, l88, l89, l90, l91 = sch.get_loops(block=b85) -l92 = sch.fuse(l90, l91, preserve_unit_iters=True) -v93 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b85, ann_key="meta_schedule.cooperative_fetch", ann_val=v93) -b94 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="shared") 
-sch.compute_at(block=b94, loop=l59, preserve_unit_loops=True, index=-1) -l95, l96, l97, l98, l99, l100 = sch.get_loops(block=b94) -l101 = sch.fuse(l99, l100, preserve_unit_iters=True) -v102 = sch.sample_categorical(candidates=[1, 2, 4, 8], probs=[0.25, 0.25, 0.25, 0.25]) -sch.annotate(block_or_loop=b94, ann_key="meta_schedule.cooperative_fetch", ann_val=v102) -b103 = sch.cache_read(block=b21, read_buffer_index=0, storage_scope="wmma.matrix_a") -sch.compute_at(block=b103, loop=l60, preserve_unit_loops=True, index=-1) -l104, l105, l106, l107, l108, l109, l110 = sch.get_loops(block=b103) -l111, l112 = sch.split(loop=l110, factors=[None, 16], preserve_unit_iters=True) -l113, l114 = sch.split(loop=l109, factors=[None, 16], preserve_unit_iters=True) -l115, l116, l117, l118, l119, l120, l121, l122, l123 = sch.get_loops(block=b103) -sch.reorder(l122, l114, l112) -b124 = sch.blockize(loop=l114) -sch.annotate(block_or_loop=b124, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_a") -b125 = sch.cache_read(block=b21, read_buffer_index=1, storage_scope="wmma.matrix_b") -sch.compute_at(block=b125, loop=l60, preserve_unit_loops=True, index=-1) -l126, l127, l128, l129, l130, l131, l132 = sch.get_loops(block=b125) -l133, l134 = sch.split(loop=l132, factors=[None, 16], preserve_unit_iters=True) -l135, l136 = sch.split(loop=l131, factors=[None, 16], preserve_unit_iters=True) -l137, l138, l139, l140, l141, l142, l143, l144, l145 = sch.get_loops(block=b125) -sch.reorder(l144, l136, l134) -b146 = sch.blockize(loop=l136) -sch.annotate(block_or_loop=b146, ann_key="meta_schedule.auto_tensorize", ann_val="wmma_load_16x16x16_f16_b") -sch.compute_inline(block=b2) -sch.compute_inline(block=b3) -sch.storage_align(block=b85, buffer_index=0, axis=-2, factor=32, offset=8) -sch.storage_align(block=b94, buffer_index=0, axis=-2, factor=32, offset=8)""".split( - "\n" - ) - ] - check_trace(spaces, expected) - - # test adding unappliable tensor intrinsics doesn't change the search 
space - ctx = _create_context( - workload, - target, - multi_level_tiling_tensor_core( - target=target, - write_reuse_scope="shared", - in_dtype="float16", - out_dtype=["float16", "float32"], - ), - ) - check_trace(spaces, expected) - spaces = ctx.space_generator.generate_design_space(mod=ctx.mod) - assert len(spaces) == 1 - - -if __name__ == "__main__": - tvm.testing.main() From c0d2734056d4d4bfc67a125b4e61194a809f22d5 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Thu, 15 Sep 2022 23:29:17 -0700 Subject: [PATCH 184/704] [TVMScript] IRBuilder methods for `Axis` (#12808) This PR introduces remaining IRBuilder methods for `Axis`. Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/ir.h | 49 ++++++ python/tvm/script/ir_builder/tir/ir.py | 157 +++++++++++++++++- src/script/ir_builder/tir/ir.cc | 86 ++++++++++ .../unittest/test_tvmscript_ir_builder_tir.py | 43 +++++ 4 files changed, 334 insertions(+), 1 deletion(-) diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index 68948196ff6b..037606253adc 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -141,6 +141,55 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array shape, */ BlockFrame Block(String name, bool no_realize = false); +namespace axis { + +/*! + * \brief The spatial block axis defining function. + * \param dom The domain of the iteration variable. + * \param binding The binding value of the iteration variable. + * \param dtype The data type of the iteration variable. + * \return The iteration variable. + */ +Var Spatial(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32)); + +/*! + * \brief The reduced block axis defining function. + * \param dom The domain of the iteration variable. + * \param binding The binding value of the iteration variable. + * \param dtype The data type of the iteration variable. + * \return The iteration variable. 
+ */ +Var Reduce(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32)); + +/*! + * \brief The scanning block axis defining function. + * \param dom The domain of the iteration variable. + * \param binding The binding value of the iteration variable. + * \param dtype The data type of the iteration variable. + * \return The iteration variable. + */ +Var Scan(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32)); + +/*! + * \brief The opaque block axis defining function. + * \param dom The domain of the iteration variable. + * \param binding The binding value of the iteration variable. + * \param dtype The data type of the iteration variable. + * \return The iteration variable. + */ +Var Opaque(Range dom, PrimExpr binding, DataType dtype = DataType::Int(32)); + +/*! + * \brief The block axis remapping function. + * \param kinds The types of the iteration variables. + * \param bindings The binding values of the iteration variables. + * \param dtype The data types of the iteration variables. + * \return The iteration variables. + */ +Array Remap(String kinds, Array bindings, DataType dtype = DataType::Int(32)); + +} // namespace axis + /*! * \brief The serial For statement. * \param start The minimum value of iteration. diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index a5cdf8a3a105..40cd99c744d7 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -20,7 +20,7 @@ from numbers import Integral from typing import Any, Dict, List, Optional, Union, Tuple -from tvm.ir import Type +from tvm.ir import Range, Type from tvm.tir import ( Buffer, BufferLoad, @@ -344,6 +344,160 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame: return _ffi_api.Block(name, no_realize) # pylint: disable=no-member # type: ignore +def _as_range(dom: Union[Range, List[PrimExpr]]) -> Range: + """The range constructor. 
+ + Parameters + ---------- + dom : Union[Range, List[PrimExpr]] + The domain. + + Returns + ------- + res : Range + The Range. + """ + if isinstance(dom, Range): + return dom + if isinstance(dom, (list, tuple)): + return Range(dom[0], dom[1]) + return Range(0, dom) + + +class axis: # pylint: disable=invalid-name + @staticmethod + def spatial( + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + ) -> Var: + """The spatial block axis defining function. + + Parameters + ---------- + dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]] + The domain of the iteration variable. + + binding : PrimExpr + The binding value of the iteration variable. + + dtype : str + The data type of the iteration variable. + + Returns + ------- + res : Var + The iteration variable. + """ + return _ffi_api.AxisSpatial( # pylint: disable=no-member # type: ignore + _as_range(dom), binding, dtype + ) + + @staticmethod + def reduce( + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + ) -> Var: + """The reduced block axis defining function. + + Parameters + ---------- + dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]] + The domain of the iteration variable. + + binding : PrimExpr + The binding value of the iteration variable. + + dtype : str + The data type of the iteration variable. + + Returns + ------- + res : Var + The iteration variable. + """ + return _ffi_api.AxisReduce( # pylint: disable=no-member # type: ignore + _as_range(dom), binding, dtype + ) + + @staticmethod + def scan( + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + ) -> Var: + """The scanning block axis defining function. + + Parameters + ---------- + dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]] + The domain of the iteration variable. + + binding : PrimExpr + The binding value of the iteration variable. + + dtype : str + The data type of the iteration variable. 
+ + Returns + ------- + res : Var + The iteration variable. + """ + return _ffi_api.AxisScan( # pylint: disable=no-member # type: ignore + _as_range(dom), binding, dtype + ) + + @staticmethod + def opaque( + dom: Union[Range, List[PrimExpr], Tuple[PrimExpr]], binding: PrimExpr, dtype: str = "int32" + ) -> Var: + """The opaque block axis defining function. + + Parameters + ---------- + dom : Union[Range, List[PrimExpr], Tuple[PrimExpr]] + The domain of the iteration variable. + + binding : PrimExpr + The binding value of the iteration variable. + + dtype : str + The data type of the iteration variable. + + Returns + ------- + res : Var + The iteration variable. + """ + return _ffi_api.AxisOpaque( # pylint: disable=no-member # type: ignore + _as_range(dom), binding, dtype + ) + + @staticmethod + def remap(kinds: str, bindings: List[PrimExpr], dtype: str = "int32") -> Union[List[Var], Var]: + """The block axis remapping function. + + Parameters + ---------- + kinds : str + The types of the iteration variables. + + bindings : List[PrimExpr] + The binding values of the iteration variables. + + dtype : str + The data types of the iteration variables. + + Returns + ------- + res : Var + The iteration variables. 
+ """ + iter_vars = _ffi_api.AxisRemap( # pylint: disable=no-member # type: ignore + kinds, bindings, dtype + ) + return iter_vars[0] if len(iter_vars) == 1 else iter_vars + + S = spatial # pylint: disable=invalid-name + R = reduce # pylint: disable=invalid-name + + def serial( start: PrimExpr, stop: PrimExpr = None, *, annotations: Dict[str, Any] = None ) -> frame.ForFrame: @@ -843,6 +997,7 @@ def var(dtype, name="") -> Var: "match_buffer", "preflattened_buffer", "block", + "axis", "serial", "parallel", "vectorized", diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 22c7face7084..5013e321728e 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -173,6 +173,86 @@ BlockFrame Block(String name, bool no_realize) { return BlockFrame(n); } +namespace axis { + +IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) { + if (Optional opt_frame = IRBuilder::Current()->GetLastFrame()) { + BlockFrame frame = opt_frame.value(); + frame->iter_vars.push_back(iter_var); + frame->iter_values.push_back(binding); + } else { + LOG(FATAL) << "TypeError: The last frame is not BlockFrame"; + } + return iter_var; +} + +#define TVM_TIR_IR_BUILDER_AXIS(Method, Kind, Name) \ + Var Method(Range dom, PrimExpr binding, DataType dtype) { \ + ICHECK(dom.defined()) << Name << " axis must have a domain"; \ + int bits = std::max({dom->min.dtype().bits(), dom->extent.dtype().bits(), dtype.bits()}); \ + return PushBlockVar(IterVar(/*dom=*/dom, /*var=*/Var("", dtype.with_bits(bits)), \ + /*iter_type=*/Kind, /*thread_tag=*/""), \ + binding) \ + ->var; \ + } +TVM_TIR_IR_BUILDER_AXIS(Spatial, tvm::tir::IterVarType::kDataPar, "Spatial"); +TVM_TIR_IR_BUILDER_AXIS(Reduce, tvm::tir::IterVarType::kCommReduce, "Reduction"); +TVM_TIR_IR_BUILDER_AXIS(Scan, tvm::tir::IterVarType::kOrdered, "Scan"); +TVM_TIR_IR_BUILDER_AXIS(Opaque, tvm::tir::IterVarType::kOpaque, "Opaque"); +#undef TVM_TIR_IR_BUILDER_AXIS + +Array Remap(String kinds, Array 
bindings, DataType dtype) { + using namespace tvm::tir; + Array results; + ICHECK_EQ(kinds.size(), bindings.size()); + int n = bindings.size(); + results.reserve(n); + for (int i = 0; i < n; ++i) { + char c = kinds.c_str()[i]; + PrimExpr e = bindings[i]; + const VarNode* v = e.as(); + ICHECK(v) << "TypeError: Only Var is supported in T.axis.remap"; + Range dom{nullptr}; + for (const auto& frame : IRBuilder::Current()->frames) { + if (const auto* for_frame = frame.as()) { + ICHECK_EQ(for_frame->doms.size(), for_frame->vars.size()); + int n = for_frame->doms.size(); + for (int i = 0; i < n; ++i) { + if (for_frame->vars[i].get() == v) { + dom = for_frame->doms[i]; + break; + } + } + if (dom.defined()) { + break; + } + } + } + ICHECK(dom.defined()) << "TypeError: Variable is not in the loop: " << GetRef(v); + DataType dtype = v->dtype; + if (c == 'S') { + results.push_back(PushBlockVar(IterVar(/*dom=*/dom, + /*var=*/Var("", dtype), + /*iter_type=*/IterVarType::kDataPar, + /*thread_tag=*/""), + e) + ->var); + } else if (c == 'R') { + results.push_back(PushBlockVar(IterVar(/*dom=*/dom, + /*var=*/Var("", dtype), + /*iter_type=*/IterVarType::kCommReduce, + /*thread_tag=*/""), + e) + ->var); + } else { + LOG(FATAL) << "Unknown axis kind: " << c; + } + } + return results; +} + +} // namespace axis + #define TVM_TIR_IR_BUILDER_FOR_FRAME(Method, Kind) \ ForFrame Method(PrimExpr start, PrimExpr stop, Optional> annotations) { \ PrimExpr min = start; \ @@ -304,6 +384,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(P TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisSpatial").set_body_typed(axis::Spatial); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisReduce").set_body_typed(axis::Reduce); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisScan").set_body_typed(axis::Scan); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisOpaque").set_body_typed(axis::Opaque); 
+TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisRemap").set_body_typed(axis::Remap); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.Serial").set_body_typed(Serial); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Parallel").set_body_typed(Parallel); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Vectorized").set_body_typed(Vectorized); diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 9cbfd75e2280..d893ebc545c6 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -114,6 +114,49 @@ def test_ir_builder_tir_block(): assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) +def test_ir_builder_tir_axis(): + with IRBuilder() as ib: + a = T.var("int32", "a") + b = T.var("int32", "b") + c = T.var("int32", "c") + d = T.var("int32", "d") + with T.block("block"): + T.axis.spatial(8, a) + T.axis.reduce(16, b) + T.axis.scan(32, c) + T.axis.opaque(64, d) + T.evaluate(0) + + # the block generated by IRBuilder + block_realize_actual = ib.get() + + # the expected block + var_a = tir.Var("a", "int32") + var_b = tir.Var("b", "int32") + var_c = tir.Var("c", "int32") + var_d = tir.Var("d", "int32") + block_expected = tir.Block( + iter_vars=[ + tir.IterVar((0, 8), tir.Var("", "int32"), iter_type=tir.IterVar.DataPar), + tir.IterVar((0, 16), tir.Var("", "int32"), iter_type=tir.IterVar.CommReduce), + tir.IterVar((0, 32), tir.Var("", "int32"), iter_type=tir.IterVar.Ordered), + tir.IterVar((0, 64), tir.Var("", "int32"), iter_type=tir.IterVar.DimInfo), + ], + reads=[], + writes=[], + name_hint="block", + body=tir.Evaluate(0), + annotations={"tir.script_parsing_detect_access": tir.IntImm("int64", 3)}, + ) + block_realize_expected = tir.BlockRealize( + iter_values=[var_a, var_b, var_c, var_d], + predicate=True, + block=block_expected, + ) + # Check if the generated ir is expected + 
assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) + + def test_ir_builder_tir_for(): with IRBuilder() as ib: with T.serial(128) as a: From 9b17f344a31a13226458a2d48dcb7b55ce282274 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 16 Sep 2022 00:02:18 -0700 Subject: [PATCH 185/704] [ci][docker] Fix nightly Docker tests (#12804) These were broken due to this missing guard: https://ci.tlcpack.ai/job/docker-images-ci/job/docker-image-run-tests/223/console Co-authored-by: driazati --- Jenkinsfile | 4 ++-- ci/jenkins/Prepare.groovy.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5835100fde3e..8ca181a759ff 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-09-14T11:22:31.582192 +// Generated at 2022-09-15T16:03:21.407877 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -213,7 +213,7 @@ def cancel_previous_build() { def checkout_trusted_files() { // trust everything from branch builds - if (!env.BRANCH_NAME.startsWith('PR-')) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { return; } diff --git a/ci/jenkins/Prepare.groovy.j2 b/ci/jenkins/Prepare.groovy.j2 index 4464108968de..cb677f437a3c 100644 --- a/ci/jenkins/Prepare.groovy.j2 +++ b/ci/jenkins/Prepare.groovy.j2 @@ -101,7 +101,7 @@ def cancel_previous_build() { def checkout_trusted_files() { // trust everything from branch builds - if (!env.BRANCH_NAME.startsWith('PR-')) { + if (env.BRANCH_NAME == null || !env.BRANCH_NAME.startsWith('PR-')) { return; } From 6b3be496e6ac2e2de22a59d935e5256e04bc8c74 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Fri, 16 Sep 2022 06:11:07 -0700 Subject: [PATCH 186/704] [MetaSchedule][Minor]Fix Random State Fork in TuneContext Clone Function (#12811) Fix random state fork in TuneContext Clone function. --- src/meta_schedule/tune_context.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 3650c0374dab..ee24624fe9e4 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -74,6 +74,7 @@ TuneContext TuneContextNode::Clone() const { } if (this->space_generator.defined()) n->space_generator = this->space_generator.value()->Clone(); if (this->search_strategy.defined()) n->search_strategy = this->search_strategy.value()->Clone(); + n->rand_state = support::LinearCongruentialEngine(&n->rand_state).ForkSeed(); n->Initialize(); return TuneContext(n); } From 8f8b6d8837a989a95ea9716b517644f898ef9b7e Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 16 Sep 2022 16:51:09 +0100 Subject: [PATCH 187/704] Fix for import requests and import caffe failures (#12813) Recently virtual environments were introduced in the docker images which was a great contribution to localize 
errors: https://github.com/apache/tvm/pull/12663. In this fix, link to the caffe is created inside this virtual env instead of adding it to the system path of python. This fix also removes importing request package where not needed. Fixes #12663 --- ci/scripts/github_skipped_tests_comment.py | 2 -- docker/install/ubuntu_install_caffe.sh | 8 +++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py index ef0630620b97..c07fbf4a8463 100755 --- a/ci/scripts/github_skipped_tests_comment.py +++ b/ci/scripts/github_skipped_tests_comment.py @@ -24,8 +24,6 @@ from urllib import error from xml.etree import ElementTree -import requests - from git_utils import git, GitHubRepo, parse_remote from cmd_utils import init_log diff --git a/docker/install/ubuntu_install_caffe.sh b/docker/install/ubuntu_install_caffe.sh index c37bfb764935..4d9763b69aa3 100755 --- a/docker/install/ubuntu_install_caffe.sh +++ b/docker/install/ubuntu_install_caffe.sh @@ -18,6 +18,11 @@ set -euxo pipefail +if [ -z "${TVM_VENV+x}" ]; then + echo "ERROR: expect TVM_VENV env var to be set" + exit 2 +fi + apt-get update --fix-missing # # Install dependencies @@ -60,4 +65,5 @@ cd / && rm -rf /caffe_src PYCAFFE_ROOT=${CAFFE_HOME}/python echo "${CAFFE_HOME}/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig -ln -s ${PYCAFFE_ROOT}/caffe /usr/local/lib/python3.7/dist-packages/caffe +VENV_SITE_PACKAGE=$(pip3 show numpy | grep "Location:" | cut -d ' ' -f 2) +ln -s ${PYCAFFE_ROOT}/caffe ${VENV_SITE_PACKAGE}/caffe From 43d9a3b93baeeec33fa0f4953f50c7242e8183b5 Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Fri, 16 Sep 2022 10:50:30 -0700 Subject: [PATCH 188/704] =?UTF-8?q?[Hexagon]=20Reduce=20the=20number=20of?= =?UTF-8?q?=20tests=20run=20for=20VTCM=20testing=20in=20order=20to?= =?UTF-8?q?=E2=80=A6=20(#12783)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Hexagon] Reduce the 
number of tests run for VTCM testing in order to speedup CI. --- .../test_parallel_hvx_load_vtcm.py | 25 ++++++++++++---- .../test_hexagon/test_vtcm_bandwidth.py | 30 +++++++++++-------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py index c9ff07c490c8..5dcb4b18b845 100644 --- a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py @@ -308,7 +308,14 @@ def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global") a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope) b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope) c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope) - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + + # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. + number = 1 + repeat = 1 + + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) time = timer(a_hexagon, b_hexagon, c_hexagon) gops = round(operations * 128 * 3 / time.mean / 1e9, 4) return gops, c_hexagon.asnumpy() @@ -338,7 +345,13 @@ def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. 
+ number = 1 + repeat = 1 + + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon) gops = round(operations * 128 * 3 / time.mean / 1e9, 4) return gops, c_hexagon.asnumpy() @@ -372,12 +385,12 @@ def expected_output(operations, input_a, input_b, input_c): class TestMatMulVec: - + # Removed most of these to speedup CI. operations = tvm.testing.parameter( 1024, - 2048, - 4096, - 5 * 2048, # 3.93MB of total transfer + # 2048, + # 4096, + # 5 * 2048, # 3.93MB of total transfer # 16384, #Only works on 8Gen1 HDK's # 5 * 4096, # 7.86MB of total transfer. Only works on 8Gen1 HDK's ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py index 6db8b9101997..83daf2458737 100644 --- a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -96,10 +96,13 @@ def evaluate(hexagon_session, sch, size): a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" ) - # a_hexagon = allocate_hexagon_array(hexagon_session.device, data=a, mem_scope="global") - # a_vtcm_hexagon = allocate_hexagon_array(hexagon_session.device, data=a_vtcm, mem_scope="global.vtcm") + # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. + number = 1 + repeat = 1 - timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) runtime = timer(a_hexagon, a_vtcm_hexagon) gbps = round((size / 2**30) / runtime.mean, 4) @@ -110,18 +113,19 @@ def evaluate(hexagon_session, sch, size): class TestMatMulVec: + # Removed most of these to speedup CI. 
size = tvm.testing.parameter( - 10 * KB, - 20 * KB, - 40 * KB, - 80 * KB, - 160 * KB, - 320 * KB, + # 10 * KB, + # 20 * KB, + # 40 * KB, + # 80 * KB, + # 160 * KB, + # 320 * KB, 640 * KB, - MB, - 2 * MB, - 3 * MB, - 4 * MB, + # MB, + # 2 * MB, + # 3 * MB, + # 4 * MB, # 8 * MB, # Only works on 8gen1 HDKs ) From 7c96e255ce7d6d6a22b3665449ebfafb581a9fc8 Mon Sep 17 00:00:00 2001 From: Janet Schneider Date: Fri, 16 Sep 2022 11:53:53 -0700 Subject: [PATCH 189/704] [Hexagon] [runtime] Protect access to global HexagonBufferManager map (#12807) * Protect access to global buffer manager map * Fix lint --- src/runtime/hexagon/hexagon_buffer_manager.h | 25 +++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/runtime/hexagon/hexagon_buffer_manager.h b/src/runtime/hexagon/hexagon_buffer_manager.h index 658a39fac8a8..a698b0ecb163 100644 --- a/src/runtime/hexagon/hexagon_buffer_manager.h +++ b/src/runtime/hexagon/hexagon_buffer_manager.h @@ -43,7 +43,10 @@ class HexagonBufferManager { CHECK(it != hexagon_buffer_map_.end()) << "Attempt made to free unknown or already freed dataspace allocation"; CHECK(it->second != nullptr); - hexagon_buffer_map_.erase(it); + { + std::lock_guard lock(map_mutex_); + hexagon_buffer_map_.erase(it); + } } /*! * \brief Allocate a HexagonBuffer. @@ -53,15 +56,22 @@ class HexagonBufferManager { void* AllocateHexagonBuffer(Args&&... args) { auto buf = std::make_unique(std::forward(args)...); void* ptr = buf->GetPointer(); - hexagon_buffer_map_.insert({ptr, std::move(buf)}); + { + std::lock_guard lock(map_mutex_); + hexagon_buffer_map_.insert({ptr, std::move(buf)}); + } return ptr; } //! \brief Returns whether the HexagonBuffer is in the map. - size_t count(void* ptr) { return hexagon_buffer_map_.count(ptr); } + size_t count(void* ptr) { + std::lock_guard lock(map_mutex_); + return hexagon_buffer_map_.count(ptr); + } //! \brief Returns an iterator to the HexagonBuffer within the map. 
HexagonBuffer* find(void* ptr) { + std::lock_guard lock(map_mutex_); auto it = hexagon_buffer_map_.find(ptr); if (it != hexagon_buffer_map_.end()) { return it->second.get(); @@ -69,9 +79,18 @@ class HexagonBufferManager { return nullptr; } + //! \brief Returns whether the HexagonBufferManager has any allocations. + bool empty() { + std::lock_guard lock(map_mutex_); + return hexagon_buffer_map_.empty(); + } + private: //! \brief Contains the HexagonBuffer objects managed by this class. std::unordered_map> hexagon_buffer_map_; + + //! \brief Protects updates to the map. + std::mutex map_mutex_; }; } // namespace hexagon From 5d0a16749cdba494178dee7deefa2938d1f8a88b Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 16 Sep 2022 12:54:51 -0700 Subject: [PATCH 190/704] [ci] Fix docs push (#12810) This was missing a repo checkout and failing as in https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4302/pipeline. This also adds in the changes from #12719: Fixes #12600. The original solution there doesn't actually fix the issue, there would need to be some job queue that could make sure to reject old pushes. Since this case is pretty rare, generally the next commit that comes along and builds will fix everything up so we can ignore failures that happen on `push`es. --- Jenkinsfile | 7 ++++--- ci/jenkins/Deploy.groovy.j2 | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8ca181a759ff..a61ab1cd69a2 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-09-15T16:03:21.407877 +// Generated at 2022-09-16T08:47:49.743918 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> @@ -4205,7 +4205,7 @@ def deploy_docs() { script: ''' cd tvm-site git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git - git push deploy $DOCS_DEPLOY_BRANCH + git push deploy $DOCS_DEPLOY_BRANCH || true ''', label: 'Upload docs to apache/tvm-site' ) @@ -4222,7 +4222,8 @@ def deploy() { node('CPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docs") { timeout(time: max_time, unit: 'MINUTES') { - sh( + init_git() + sh( script: """ set -eux . ci/scripts/retry.sh diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2 index d2ee4360da6b..9812e1113598 100644 --- a/ci/jenkins/Deploy.groovy.j2 +++ b/ci/jenkins/Deploy.groovy.j2 @@ -73,7 +73,7 @@ def deploy_docs() { script: ''' cd tvm-site git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git - git push deploy $DOCS_DEPLOY_BRANCH + git push deploy $DOCS_DEPLOY_BRANCH || true ''', label: 'Upload docs to apache/tvm-site' ) @@ -90,6 +90,7 @@ def deploy() { feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'", ws="tvm/deploy-docs", ) %} + init_git() {{ m.download_artifacts(tag='docs', filenames=["docs.tgz"]) }} deploy_docs() {% endcall %} From e037ae49928592afdfa8d2c27198fc68592f9528 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:29:57 -0700 Subject: [PATCH 191/704] [ci] Add bot to post welcome comment (#12695) This would post the comment that the tests bot and the docs comment bot uses straightaway when a PR is posted. This will contain links to generic info about posting PRs (and obviate the `.github/PULL_REQUEST_TEMPLATE.md`) as well as dynamic info about the specific PR (filled in later by the respective bots). This would make things like the auto-cc bot more transparent since it would have a link to the relevant issue. 
Tested live here: https://github.com/driazati/tvm/pull/21#issuecomment-1236019529 --- .github/PULL_REQUEST_TEMPLATE.md | 1 - .github/workflows/docs_bot.yml | 18 -- .github/workflows/pr_comment_bot.yml | 55 +++++ .github/workflows/tag_teams.yml | 7 - .github/workflows/tests_bot.yml | 21 -- ci/scripts/__init__.py | 19 ++ ci/scripts/git_utils.py | 60 ++++- ci/scripts/github_commenter.py | 132 +++++++++++ ci/scripts/github_docs_comment.py | 83 +------ ci/scripts/github_pr_comment.py | 141 +++++++++++ ci/scripts/github_skipped_tests_comment.py | 140 ++++------- ci/scripts/github_tag_teams.py | 85 +++++-- tests/python/ci/test_ci.py | 262 ++++++++++++++++++--- 13 files changed, 741 insertions(+), 283 deletions(-) delete mode 100644 .github/PULL_REQUEST_TEMPLATE.md delete mode 100644 .github/workflows/docs_bot.yml create mode 100644 .github/workflows/pr_comment_bot.yml delete mode 100644 .github/workflows/tests_bot.yml create mode 100644 ci/scripts/__init__.py create mode 100644 ci/scripts/github_commenter.py create mode 100755 ci/scripts/github_pr_comment.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 093cdc483c78..000000000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1 +0,0 @@ -Thanks for contributing to TVM! Please refer to guideline https://tvm.apache.org/docs/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @ them in the pull request thread. 
diff --git a/.github/workflows/docs_bot.yml b/.github/workflows/docs_bot.yml deleted file mode 100644 index 73c12a8d7d05..000000000000 --- a/.github/workflows/docs_bot.yml +++ /dev/null @@ -1,18 +0,0 @@ - -name: docs-bot -on: - status -jobs: - run-docs-bot: - if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Comment link to docs - env: - COMMIT_SHA: ${{ github.event.sha }} - TARGET_URL: ${{ github.event.target_url }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -eux - python ci/scripts/github_docs_comment.py \ No newline at end of file diff --git a/.github/workflows/pr_comment_bot.yml b/.github/workflows/pr_comment_bot.yml new file mode 100644 index 000000000000..89416df928b8 --- /dev/null +++ b/.github/workflows/pr_comment_bot.yml @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: comment-bot +on: + pull_request_target: + types: [opened, reopened, edited, ready_for_review, labeled] + status: + +concurrency: + group: pr-comment-${{ github.event.number }}-${{ github.event.target_url }} + cancel-in-progress: true + +jobs: + run-comment-bot: + if: ${{ github.repository == 'apache/tvm' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Comment bot comment (pr) + if: ${{ github.event.number }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ github.event.number }} + run: | + set -eux + python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" + - name: Comment bot comment (status) + if: ${{ github.event.state }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + URL: ${{ github.event.target_url }} + run: | + set -eux + if [[ "$URL" == *"PR-"* ]]; then + echo "PR status, sending comment" + PR_NUMBER=$(echo $URL | sed 's/.*PR-//g' | sed 's/\/.*//g') + python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" + else + echo "Not a PR status, skipping" + fi diff --git a/.github/workflows/tag_teams.yml b/.github/workflows/tag_teams.yml index 7c10f9c33d9f..c0c1b8b8299d 100644 --- a/.github/workflows/tag_teams.yml +++ b/.github/workflows/tag_teams.yml @@ -15,16 +15,9 @@ # specific language governing permissions and limitations # under the License. -# GH actions. 
-# We use it to cover windows and mac builds -# Jenkins is still the primary CI - name: Teams on: - # See https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target - pull_request_target: - types: [opened, reopened, edited, ready_for_review, labeled] issues: types: [opened, edited, reopened, labeled] diff --git a/.github/workflows/tests_bot.yml b/.github/workflows/tests_bot.yml deleted file mode 100644 index 0ddae2afb771..000000000000 --- a/.github/workflows/tests_bot.yml +++ /dev/null @@ -1,21 +0,0 @@ - -name: tests-bot -on: - status -jobs: - run-tests-bot: - if: ${{ github.repository == 'apache/tvm' && github.event.state == 'success' && github.event.context == 'tvm-ci/pr-head' }} - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Comment skipped tests - env: - AWS_ACCESS_KEY_ID: ${{ secrets.CI_RESOURCES_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.CI_RESOURCES_AWS_SECRET_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 - COMMIT_SHA: ${{ github.event.sha }} - TARGET_URL: ${{ github.event.target_url }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - set -eux - python ci/scripts/github_skipped_tests_comment.py \ No newline at end of file diff --git a/ci/scripts/__init__.py b/ci/scripts/__init__.py new file mode 100644 index 000000000000..064781fa158d --- /dev/null +++ b/ci/scripts/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Package to enable testing of CI scripts""" + +from . import github_skipped_tests_comment, github_pr_comment, github_tag_teams, github_docs_comment diff --git a/ci/scripts/git_utils.py b/ci/scripts/git_utils.py index cb639178c3f9..1295ff8e3c2c 100644 --- a/ci/scripts/git_utils.py +++ b/ci/scripts/git_utils.py @@ -19,11 +19,14 @@ import json import subprocess import re +import os import base64 import logging from urllib import request, error from typing import Dict, Tuple, Any, Optional, List +DRY_RUN = object() + def compress_query(query: str) -> str: query = query.replace("\n", "") @@ -32,7 +35,7 @@ def compress_query(query: str) -> str: def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = None): - print(f"Requesting POST to", url, "with", body) + logging.info(f"Requesting POST to", url, "with", body) headers = {} req = request.Request(url, headers=headers, method="POST") if auth is not None: @@ -51,11 +54,21 @@ def post(url: str, body: Optional[Any] = None, auth: Optional[Tuple[str, str]] = return response.read() +def dry_run_token(is_dry_run: bool) -> Any: + if is_dry_run: + return DRY_RUN + return os.environ["GITHUB_TOKEN"] + + class GitHubRepo: - def __init__(self, user, repo, token): + GRAPHQL_URL = "https://api.github.com/graphql" + + def __init__(self, user, repo, token, test_data=None): self.token = token self.user = user self.repo = repo + self.test_data = test_data + self.num_calls = 0 self.base = f"https://api.github.com/repos/{user}/{repo}/" def headers(self): @@ -63,22 +76,41 @@ def headers(self): 
"Authorization": f"Bearer {self.token}", } + def dry_run(self) -> bool: + return self.token == DRY_RUN + def graphql(self, query: str, variables: Optional[Dict[str, str]] = None) -> Dict[str, Any]: query = compress_query(query) if variables is None: variables = {} + response = self._request( - "https://api.github.com/graphql", + self.GRAPHQL_URL, {"query": query, "variables": variables}, method="POST", ) + if self.dry_run(): + return self.testing_response("POST", self.GRAPHQL_URL) + if "data" not in response: msg = f"Error fetching data with query:\n{query}\n\nvariables:\n{variables}\n\nerror:\n{json.dumps(response, indent=2)}" raise RuntimeError(msg) return response + def testing_response(self, method: str, url: str) -> Any: + self.num_calls += 1 + key = f"[{self.num_calls}] {method} - {url}" + if self.test_data is not None and key in self.test_data: + return self.test_data[key] + logging.info(f"Unknown URL in dry run: {key}") + return {} + def _request(self, full_url: str, body: Dict[str, Any], method: str) -> Dict[str, Any]: - print(f"Requesting {method} to", full_url, "with", body) + if self.dry_run(): + logging.info(f"Dry run, would have requested a {method} to {full_url} with {body}") + return self.testing_response(method, full_url) + + logging.info(f"Requesting {method} to {full_url} with {body}") req = request.Request(full_url, headers=self.headers(), method=method.upper()) req.add_header("Content-Type", "application/json; charset=utf-8") data = json.dumps(body) @@ -111,16 +143,22 @@ def post(self, url: str, data: Dict[str, Any]) -> Dict[str, Any]: return self._request(self.base + url, data, method="POST") def get(self, url: str) -> Dict[str, Any]: + if self.dry_run(): + logging.info(f"Dry run, would have requested a GET to {url}") + return self.testing_response("GET", url) url = self.base + url - print("Requesting GET to", url) + logging.info(f"Requesting GET to {url}") req = request.Request(url, headers=self.headers()) with request.urlopen(req) as 
response: response = json.loads(response.read()) return response def delete(self, url: str) -> Dict[str, Any]: + if self.dry_run(): + logging.info(f"Dry run, would have requested a DELETE to {url}") + return self.testing_response("DELETE", url) url = self.base + url - print("Requesting DELETE to", url) + logging.info(f"Requesting DELETE to {url}") req = request.Request(url, headers=self.headers(), method="DELETE") with request.urlopen(req) as response: response = json.loads(response.read()) @@ -136,18 +174,22 @@ def parse_remote(remote: str) -> Tuple[str, str]: parts = remote.split("/") if len(parts) < 2: raise RuntimeError(f"Unable to parse remote '{remote}'") - return parts[-2], parts[-1].replace(".git", "") + user, repo = parts[-2], parts[-1].replace(".git", "") else: # Parse SSH remote m = re.search(r":(.*)/(.*)\.git", remote) if m is None or len(m.groups()) != 2: raise RuntimeError(f"Unable to parse remote '{remote}'") - return m.groups() + user, repo = m.groups() + + user = os.getenv("DEBUG_USER", user) + repo = os.getenv("DEBUG_REPO", repo) + return user, repo def git(command, **kwargs): command = ["git"] + command - print("Running", command) + logging.info(f"Running {command}") proc = subprocess.run(command, stdout=subprocess.PIPE, encoding="utf-8", **kwargs) if proc.returncode != 0: raise RuntimeError(f"Command failed {command}:\nstdout:\n{proc.stdout}") diff --git a/ci/scripts/github_commenter.py b/ci/scripts/github_commenter.py new file mode 100644 index 000000000000..dc71fcd1fd32 --- /dev/null +++ b/ci/scripts/github_commenter.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import re +import logging +from typing import Dict, Tuple, Any, Optional, List, Union + +from git_utils import GitHubRepo + +BOT_COMMENT_START = "" +WELCOME_TEXT = "Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment." + + +class BotCommentBuilder: + ALLOWLIST_USERS = {"driazati", "gigiblender", "areusch"} + + def __init__(self, github: GitHubRepo, data: Dict[str, Any]): + self.github = github + self.pr_number = data["number"] + self.comment_data = data["comments"]["nodes"] + self.author = data["author"]["login"] + + def find_bot_comment(self) -> Optional[Dict[str, Any]]: + """ + Return the existing bot comment or None if it does not exist + """ + for comment in self.comment_data: + logging.info(f"Checking comment {comment}") + if ( + comment["author"]["login"] == "github-actions" + and BOT_COMMENT_START in comment["body"] + ): + logging.info("Found existing comment") + return comment + logging.info("No existing comment found") + return None + + def find_existing_body(self) -> Dict[str, str]: + """ + Find existing dynamic bullet point items + """ + existing_comment = self.find_bot_comment() + if existing_comment is None: + logging.info(f"No existing comment while searching for body items") + return {} + + matches = re.findall( + r"([\S\s]*?)", + existing_comment["body"], + flags=re.MULTILINE, + ) + 
logging.info(f"Fetch body item matches: {matches}") + + items = {} + for start, text, end in matches: + if start != end: + raise RuntimeError( + f"Malformed comment found: {start} marker did not have matching end, found instead {end}" + ) + items[start] = text.strip().lstrip("* ") + + logging.info(f"Found body items: {items}") + return items + + def _post_comment(self, body_items: Dict[str, str]): + comment = BOT_COMMENT_START + "\n\n" + WELCOME_TEXT + "\n\n" + for key, content in body_items.items(): + line = self.start_key(key) + "\n * " + content.strip() + self.end_key(key) + logging.info(f"Adding line {line}") + comment += line + comment += "\n\nGenerated by [tvm-bot](https://github.com/apache/tvm/blob/main/ci/README.md#github-actions)" + + data = {"body": comment} + url = f"issues/{self.pr_number}/comments" + + logging.info(f"Commenting {comment} on {url}") + + if self.author not in self.ALLOWLIST_USERS: + logging.info(f"Skipping comment for author {self.author}") + return + + existing_comment = self.find_bot_comment() + if existing_comment is None: + # Comment does not exist, post it + r = self.github.post(url, data) + else: + # Comment does exist, update it + comment_url = f"issues/comments/{existing_comment['databaseId']}" + r = self.github.patch(comment_url, data) + + logging.info(f"Got response from posting comment: {r}") + + def start_key(self, key: str) -> str: + return f"" + + def end_key(self, key: str) -> str: + return f"" + + def post_items(self, items: List[Tuple[str, str]]): + """ + Update or post bullet points in the PR based on 'items' which is a + list of (key, text) pairs + """ + # Find the existing bullet points + body_items = self.find_existing_body() + + # Add or update the requested items + for key, text in items: + if text is None or text.strip() == "": + logging.info(f"Skipping {key} since it was empty") + continue + logging.info(f"Updating comment items {key} with {text}") + body_items[key] = text.strip() + + # Post or update the comment 
+ # print(body_items) + self._post_comment(body_items=body_items) diff --git a/ci/scripts/github_docs_comment.py b/ci/scripts/github_docs_comment.py index 64377b632c48..0a29dde2038a 100755 --- a/ci/scripts/github_docs_comment.py +++ b/ci/scripts/github_docs_comment.py @@ -16,29 +16,19 @@ # specific language governing permissions and limitations # under the License. -import os -import logging -import argparse -import sys -from urllib import error - -from git_utils import git, GitHubRepo, parse_remote -from cmd_utils import init_log - -DOCS_BOT_MARKER = "\n\n" -GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]" +from typing import Dict, Any def build_docs_url(base_url_docs, pr_number, build_number): return f"{base_url_docs}/PR-{str(pr_number)}/{str(build_number)}/docs/index.html" -def get_pr_comments(github, url): - try: - return github.get(url) - except error.HTTPError as e: - logging.exception(f"Failed to retrieve PR comments: {url}: {e}") - return [] +def find_target_url(pr_head: Dict[str, Any]): + for status in pr_head["statusCheckRollup"]["contexts"]["nodes"]: + if status.get("context", "") == "tvm-ci/pr-head": + return status["targetUrl"] + + raise RuntimeError(f"Unable to find tvm-ci/pr-head status in {pr_head}") def get_pr_and_build_numbers(target_url): @@ -49,62 +39,15 @@ def get_pr_and_build_numbers(target_url): return {"pr_number": pr_number, "build_number": build_number} -def search_for_docs_comment(comments): - for comment in comments: - if ( - comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN - and DOCS_BOT_MARKER in comment["body"] - ): - return comment - return None - - -if __name__ == "__main__": - help = "Add comment with link to docs" - parser = argparse.ArgumentParser(description=help) - parser.add_argument("--remote", default="origin", help="ssh remote to parse") - parser.add_argument("--base-url-docs", default="https://pr-docs.tlcpack.ai") - parser.add_argument( - "--dry-run", - action="store_true", - default=False, - help="run but don't send 
any request to GitHub", - ) - args = parser.parse_args() - init_log() - - remote = git(["config", "--get", f"remote.{args.remote}.url"]) - user, repo = parse_remote(remote) - - target_url = os.environ["TARGET_URL"] +def get_doc_url(pr: Dict[str, Any], base_docs_url: str = "https://pr-docs.tlcpack.ai") -> str: + pr_head = pr["commits"]["nodes"][0]["commit"] + target_url = find_target_url(pr_head) pr_and_build = get_pr_and_build_numbers(target_url) - commit_sha = os.environ["COMMIT_SHA"] + commit_sha = pr_head["oid"] docs_url = build_docs_url( - args.base_url_docs, pr_and_build["pr_number"], pr_and_build["build_number"] + base_docs_url, pr_and_build["pr_number"], pr_and_build["build_number"] ) - url = f'issues/{pr_and_build["pr_number"]}/comments' - body = f"{DOCS_BOT_MARKER}Built docs for commit {commit_sha} can be found [here]({docs_url})." - if not args.dry_run: - github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) - - # For now, only comment for PRs open by driazati, gigiblender and areusch. - get_pr_url = f'pulls/{pr_and_build["pr_number"]}' - pull_request_body = github.get(get_pr_url) - author = pull_request_body["user"]["login"] - if author not in ["driazati", "gigiblender", "areusch"]: - logging.info(f"Skipping this action for user {author}") - sys.exit(0) - - pr_comments = get_pr_comments(github, url) - comment = search_for_docs_comment(pr_comments) - - if comment is not None: - comment_url = comment["url"] - github.patch(comment_url, {"body": body}) - else: - github.post(url, {"body": body}) - else: - logging.info(f"Dry run, would have posted {url} with data {body}.") + return f"Built docs for commit {commit_sha} can be found [here]({docs_url})." 
diff --git a/ci/scripts/github_pr_comment.py b/ci/scripts/github_pr_comment.py new file mode 100755 index 000000000000..bcf4c5096ab0 --- /dev/null +++ b/ci/scripts/github_pr_comment.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import argparse +import os +import json + +from git_utils import git, GitHubRepo, parse_remote, DRY_RUN +from cmd_utils import init_log +from github_commenter import BotCommentBuilder +from github_skipped_tests_comment import get_skipped_tests_comment +from github_tag_teams import get_tags +from github_docs_comment import get_doc_url + +PR_QUERY = """ + query ($owner: String!, $name: String!, $number: Int!) { + repository(owner: $owner, name: $name) { + pullRequest(number: $number) { + title + body + state + number + author { + login + } + labels(first:100) { + nodes { + name + } + } + comments(last: 100) { + pageInfo { + hasPreviousPage + } + nodes { + author { + login + } + databaseId + body + } + } + commits(last: 1) { + nodes { + commit { + oid + statusCheckRollup { + contexts(first: 100) { + pageInfo { + hasNextPage + } + nodes { + ... 
on StatusContext { + state + context + targetUrl + } + } + } + } + } + } + } + } + } + } +""" + + +if __name__ == "__main__": + help = "Comment a welcome message on PRs" + parser = argparse.ArgumentParser(description=help) + parser.add_argument("--remote", default="origin", help="ssh remote to parse") + parser.add_argument("--pr", required=True) + parser.add_argument("--test-data", help="(testing) mock GitHub API data") + parser.add_argument("--test-comments", help="(testing) testing comments") + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="run but don't send any request to GitHub", + ) + args = parser.parse_args() + init_log() + + remote = git(["config", "--get", f"remote.{args.remote}.url"]) + user, repo = parse_remote(remote) + + test_data = None + if args.test_data is not None: + test_data = json.loads(args.test_data) + + github = GitHubRepo( + user=user, + repo=repo, + token=DRY_RUN if args.dry_run else os.environ["GITHUB_TOKEN"], + test_data=test_data, + ) + + pr_data = github.graphql( + PR_QUERY, + { + "owner": user, + "name": repo, + "number": int(args.pr), + }, + ) + + pr_data = pr_data["data"]["repository"]["pullRequest"] + commenter = BotCommentBuilder(github=github, data=pr_data) + + if args.test_comments is not None: + test_comments = json.loads(args.test_comments) + skipped_tests = test_comments["skipped-tests"] + ccs = test_comments["ccs"] + docs_info = test_comments["docs"] + else: + skipped_tests = get_skipped_tests_comment(pr_data, github=github) + ccs = get_tags(pr_data, github, team_issue=10317) + docs_info = get_doc_url(pr_data) + + items = { + "ccs": ccs, + "skipped-tests": skipped_tests, + "docs": docs_info, + } + commenter.post_items(items=items.items()) diff --git a/ci/scripts/github_skipped_tests_comment.py b/ci/scripts/github_skipped_tests_comment.py index c07fbf4a8463..7a62f16a5b81 100755 --- a/ci/scripts/github_skipped_tests_comment.py +++ b/ci/scripts/github_skipped_tests_comment.py @@ -15,23 
+15,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import json import os import logging -import argparse import subprocess -import sys -from urllib import error from xml.etree import ElementTree - -from git_utils import git, GitHubRepo, parse_remote -from cmd_utils import init_log - -SKIPPED_TESTS_COMMENT_MARKER = "\n\n" -GITHUB_ACTIONS_BOT_LOGIN = "github-actions[bot]" - -PR_TEST_REPORT_DIR = "pr-reports" -MAIN_TEST_REPORT_DIR = "main-reports" +from pathlib import Path +from typing import Dict, Any, Optional def run_subprocess(command): @@ -43,7 +32,7 @@ def run_subprocess(command): def retrieve_test_report(s3_url, target_dir): - command = f"aws s3 cp {s3_url} {target_dir} --recursive" + command = f"aws --region us-west-2 s3 cp {s3_url} {target_dir} --recursive --no-sign-request" run_subprocess(command) @@ -70,14 +59,16 @@ def get_main_jenkins_build_number(github, common_commit): raise RuntimeError(f"Failed to find main build number for commit {common_commit}") -def retrieve_test_reports(common_main_build, pr_number, build_number, s3_prefix): +def retrieve_test_reports( + common_main_build, pr_number, build_number, s3_prefix, pr_test_report_dir, main_test_report_dir +): cur_build_s3_link = ( f"s3://{s3_prefix}/tvm/PR-{str(pr_number)}/{str(build_number)}/pytest-results" ) - retrieve_test_report(cur_build_s3_link, PR_TEST_REPORT_DIR) + retrieve_test_report(cur_build_s3_link, pr_test_report_dir) common_build_s3_link = f"s3://{s3_prefix}/tvm/main/{common_main_build}/pytest-results" - retrieve_test_report(common_build_s3_link, MAIN_TEST_REPORT_DIR) + retrieve_test_report(common_build_s3_link, main_test_report_dir) def get_pr_and_build_numbers(target_url): @@ -89,15 +80,16 @@ def get_pr_and_build_numbers(target_url): def build_test_set(directory): + directory = Path(directory) subdir_to_skipped = {} subdirs = [ item for item in os.listdir(directory) if 
os.path.isdir(os.path.join(directory, item)) ] for subdir in subdirs: subdir_to_skipped[subdir] = set() - for root, _, files in os.walk(directory + "/" + subdir): + for root, _, files in os.walk(directory / subdir): for file in files: - test_report = ElementTree.parse(root + "/" + file) + test_report = ElementTree.parse(Path(root) / file) for testcase in test_report.iter("testcase"): skipped = testcase.find("skipped") if skipped is not None: @@ -120,13 +112,13 @@ def build_comment( jenkins_prefix, ): if common_main_build["state"] != "success": - return f"{SKIPPED_TESTS_COMMENT_MARKER}Unable to run tests bot because main failed to pass CI at {common_commit_sha}." + return f"Unable to run tests bot because main failed to pass CI at {common_commit_sha}." if len(skipped_list) == 0: - return f"{SKIPPED_TESTS_COMMENT_MARKER}No additional skipped tests found in this branch for commit {commit_sha}." + return f"No additional skipped tests found in this branch for commit {commit_sha}." text = ( - f"{SKIPPED_TESTS_COMMENT_MARKER}The list below shows some tests that ran in main {common_commit_sha} but were " + f"The list below shows some tests that ran in main {common_commit_sha} but were " f"skipped in the CI build of {commit_sha}:\n" f"```\n" ) @@ -139,68 +131,51 @@ def build_comment( return text -def get_pr_comments(github, url): - try: - return github.get(url) - except error.HTTPError as e: - logging.exception(f"Failed to retrieve PR comments: {url}: {e}") - return [] +def find_target_url(pr_head: Dict[str, Any]): + for status in pr_head["statusCheckRollup"]["contexts"]["nodes"]: + if status.get("context", "") == "tvm-ci/pr-head": + return status["targetUrl"] + raise RuntimeError(f"Unable to find tvm-ci/pr-head status in {pr_head}") -def search_for_docs_comment(comments): - for comment in comments: - if ( - comment["user"]["login"] == GITHUB_ACTIONS_BOT_LOGIN - and SKIPPED_TESTS_COMMENT_MARKER in comment["body"] - ): - return comment - return None - -if __name__ == 
"__main__": - help = ( - "Compares the skipped tests of this PR against the last successful build on main. Also comments on the PR " - "issue when tests are skipped in this PR and not on main." - ) - parser = argparse.ArgumentParser(description=help) - parser.add_argument("--remote", default="origin", help="ssh remote to parse") - parser.add_argument("--s3-prefix", default="tvm-jenkins-artifacts-prod") - parser.add_argument("--jenkins-prefix", default="ci.tlcpack.ai") - parser.add_argument("--common-main-build") - parser.add_argument( - "--dry-run", - action="store_true", - default=False, - help="run but don't send any request to GitHub", - ) - args = parser.parse_args() - init_log() - - remote = git(["config", "--get", f"remote.{args.remote}.url"]) - user, repo = parse_remote(remote) - - target_url = os.environ["TARGET_URL"] +def get_skipped_tests_comment( + pr: Dict[str, Any], + github, + s3_prefix: str = "tvm-jenkins-artifacts-prod", + jenkins_prefix: str = "ci.tlcpack.ai", + pr_test_report_dir: str = "pr-reports", + main_test_report_dir: str = "main-reports", + common_commit_sha: Optional[str] = None, + common_main_build: Optional[Dict[str, Any]] = None, +) -> str: + pr_head = pr["commits"]["nodes"][0]["commit"] + target_url = find_target_url(pr_head) pr_and_build = get_pr_and_build_numbers(target_url) + logging.info(f"Getting comment for {pr_head} with target {target_url}") + + commit_sha = pr_head["oid"] - commit_sha = os.environ["COMMIT_SHA"] + is_dry_run = common_commit_sha is not None - if not args.dry_run: - github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) + if not is_dry_run: + logging.info("Fetching common commit sha and build info") common_commit_sha = get_common_commit_sha() common_main_build = get_main_jenkins_build_number(github, common_commit_sha) + retrieve_test_reports( common_main_build=common_main_build["build_number"], pr_number=pr_and_build["pr_number"], build_number=pr_and_build["build_number"], - 
s3_prefix=args.s3_prefix, + s3_prefix=s3_prefix, + main_test_report_dir=main_test_report_dir, + pr_test_report_dir=pr_test_report_dir, ) else: - assert args.common_main_build is not None - common_main_build = json.loads(args.common_main_build) - common_commit_sha = os.environ["COMMIT_SHA"] + logging.info("Dry run, expecting PR and main reports on disk") - main_tests = build_test_set(MAIN_TEST_REPORT_DIR) - build_tests = build_test_set(PR_TEST_REPORT_DIR) + main_tests = build_test_set(main_test_report_dir) + build_tests = build_test_set(pr_test_report_dir) skipped_list = [] for subdir, skipped_set in build_tests.items(): @@ -227,28 +202,7 @@ def search_for_docs_comment(comments): pr_and_build["pr_number"], pr_and_build["build_number"], commit_sha, - args.jenkins_prefix, + jenkins_prefix, ) - url = f'issues/{pr_and_build["pr_number"]}/comments' - if not args.dry_run: - # For now, only comment for PRs open by driazati, gigiblender and areusch. - get_pr_url = f'pulls/{pr_and_build["pr_number"]}' - pull_request_body = github.get(get_pr_url) - author = pull_request_body["user"]["login"] - if author not in ["driazati", "gigiblender", "areusch"]: - logging.info(f"Skipping this action for user {author}") - sys.exit(0) - - pr_comments = get_pr_comments(github, url) - comment = search_for_docs_comment(pr_comments) - - if comment is not None: - comment_url = comment["url"] - comment_id = comment_url[comment_url.find("comments/") : len(comment_url)].strip( - "comments/" - ) - github.patch(f"issues/comments/{comment_id}", {"body": body}) - else: - github.post(url, {"body": body}) - else: - logging.info(f"Dry run, would have posted {url} with data {body}.") + + return body diff --git a/ci/scripts/github_tag_teams.py b/ci/scripts/github_tag_teams.py index 4f03b4f71aea..fd63070db1ba 100755 --- a/ci/scripts/github_tag_teams.py +++ b/ci/scripts/github_tag_teams.py @@ -19,13 +19,13 @@ import os import json import argparse +import logging import re -from urllib import error -from 
typing import Dict, Any, List, Tuple +from typing import Dict, Any, List, Tuple, Optional -from git_utils import git, GitHubRepo, parse_remote, find_ccs -from cmd_utils import tags_from_title +from git_utils import git, GitHubRepo, parse_remote, find_ccs, dry_run_token +from cmd_utils import tags_from_title, init_log GITHUB_NAME_REGEX = r"@[a-zA-Z0-9-]+" @@ -168,6 +168,51 @@ def gen_cc_line(users): return "\n".join(lines) +def determine_users_to_cc( + issue: Dict[str, Any], github: GitHubRepo, team_issue: str, issue_data: Optional[Dict[str, Any]] +) -> List[str]: + if issue_data is None: + issue_data = fetch_issue(github, issue_number=int(team_issue)) + + # Fetch the list of teams + teams = parse_teams(issue_data, issue_number=int(team_issue)) + + logging.info(f"Found these teams in issue #{team_issue}\n{json.dumps(teams, indent=2)}") + + title = issue["title"] + if "author" in issue: + author = issue["author"]["login"] + else: + author = issue["user"]["login"] + tags = tags_from_title(title) + if isinstance(issue["labels"], dict): + tags += tags_from_labels(issue["labels"]["nodes"]) + else: + tags += tags_from_labels(issue["labels"]) + + tags = [t.lower() for t in tags] + logging.info(f"Found tags: {tags}") + + # Update the PR or issue based on tags in the title and GitHub tags + to_cc = [teams.get(t, []) for t in tags] + to_cc = list(set(item for sublist in to_cc for item in sublist)) + to_cc = [user for user in to_cc if user != author] + return to_cc + + +def get_tags(pr_data: Dict[str, Any], github: GitHubRepo, team_issue: int) -> str: + to_cc = determine_users_to_cc( + issue=pr_data, github=github, team_issue=team_issue, issue_data=None + ) + + logging.info(f"Users to cc based on labels: {to_cc}") + description = "See [#10317](https://github.com/apache/tvm/issues/10317) for details" + if len(to_cc) == 0: + return "No users to tag found in teams " + description + + return "cc " + ", ".join([f"@{user}" for user in to_cc]) + " " + description + + if __name__ == 
"__main__": help = "Automatically tag people based on PR / issue labels" parser = argparse.ArgumentParser(description=help) @@ -183,21 +228,17 @@ def gen_cc_line(users): help="run but don't send any request to GitHub", ) args = parser.parse_args() + init_log() remote = git(["config", "--get", f"remote.{args.remote}.url"]) user, repo = parse_remote(remote) + github = GitHubRepo(token=dry_run_token(args.dry_run), user=user, repo=repo) if args.team_issue_json: issue_data = json.loads(args.team_issue_json) else: - github = GitHubRepo(token=os.environ["GITHUB_TOKEN"], user=user, repo=repo) issue_data = fetch_issue(github, issue_number=int(args.team_issue)) - # Fetch the list of teams - teams = parse_teams(issue_data, issue_number=int(args.team_issue)) - - print(f"Found these teams in issue #{args.team_issue}\n{json.dumps(teams, indent=2)}") - # Extract the payload from GitHub Actions issue = json.loads(os.getenv("ISSUE", "null")) pr = json.loads(os.getenv("PR", "null")) @@ -213,33 +254,27 @@ def gen_cc_line(users): item = issue if issue is not None else pr title = item["title"] body = item["body"] - author = item["user"]["login"] - tags = tags_from_title(item["title"]) + tags_from_labels(item["labels"]) - tags = [t.lower() for t in tags] - print(f"Found tags: {tags}") - - # Update the PR or issue based on tags in the title and GitHub tags - to_cc = [teams.get(t, []) for t in tags] - to_cc = list(set(item for sublist in to_cc for item in sublist)) - to_cc = [user for user in to_cc if user != author] + to_cc = determine_users_to_cc( + issue=item, github=github, team_issue=args.team_issue, issue_data=issue_data + ) existing_tags = list(set(re.findall(GITHUB_NAME_REGEX, body))) existing_tags = set(tag.replace("@", "") for tag in existing_tags) - print(f"Found existing tags: {existing_tags}") + logging.info(f"Found existing tags: {existing_tags}") to_cc = [user for user in to_cc if user not in existing_tags] - print("Users to cc based on labels", to_cc) + logging.info("Users 
to cc based on labels", to_cc) # Create the new PR/issue body if len(to_cc) == 0: - print("No one to cc, exiting") + logging.info("No one to cc, exiting") exit(0) new_body = add_ccs_to_body(body, to_cc) if new_body is None: - print(f"Everyone to cc is already cc'ed, no update needed") + logging.info(f"Everyone to cc is already cc'ed, no update needed") exit(0) - print(f"Changing body from:\n----\n{body}\n----\nto:\n----\n{new_body}\n----") + logging.info(f"Changing body from:\n----\n{body}\n----\nto:\n----\n{new_body}\n----") # Set the PR/issue body on GitHub data = {"body": new_body} @@ -255,4 +290,4 @@ def gen_cc_line(users): if not args.dry_run: github.post(url, data=data) else: - print(f"Dry run, would have updated {url} with {data}") + logging.info(f"Dry run, would have updated {url} with {data}") diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 6c25694cfc74..4b8c5d9ad444 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -19,12 +19,23 @@ import subprocess import json import textwrap +import sys +import logging from pathlib import Path import pytest import tvm.testing + from .test_utils import REPO_ROOT, TempGit, run_script +# pylint: disable=wrong-import-position,wrong-import-order +sys.path.insert(0, str(REPO_ROOT / "ci")) +sys.path.insert(0, str(REPO_ROOT / "ci" / "scripts")) + +import scripts + +# pylint: enable=wrong-import-position,wrong-import-order + def parameterize_named(**kwargs): keys = next(iter(kwargs.values())).keys() @@ -71,9 +82,8 @@ def parameterize_named(**kwargs): "s3_prefix": "tvm-jenkins-artifacts-prod", "jenkins_prefix": "ci.tlcpack.ai", "common_main_build": """{"build_number": "4115", "state": "success"}""", - "commit_sha": "SHA", - "expected_url": "issues/11594/comments", - "expected_body": """\n\nThe list below shows some tests that ran in main SHA but were skipped in the CI build of SHA:\n```\nunittest -> 
ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).""", + "commit_sha": "sha1234", + "expected_body": "The list below shows some tests that ran in main sha1234 but were skipped in the CI build of sha1234:\n```\nunittest -> ctypes.tests.python.unittest.test_auto_scheduler_search_policy#test_sketch_search_policy_cuda_rpc_runner\nunittest -> ctypes.tests.python.unittest.test_roofline#test_estimate_peak_bandwidth[cuda]\n```\nA detailed report of ran tests is [here](https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/testReport/).", }, "no-diff": { "main_xml_file": "unittest/file1.xml", @@ -108,9 +118,8 @@ def parameterize_named(**kwargs): "s3_prefix": "tvm-jenkins-artifacts-prod", "jenkins_prefix": "ci.tlcpack.ai", "common_main_build": """{"build_number": "4115", "state": "success"}""", - "commit_sha": "SHA", - "expected_url": "issues/11594/comments", - "expected_body": """\n\nNo additional skipped tests found in this branch for commit SHA.""", + "commit_sha": "sha1234", + "expected_body": "No additional skipped tests found in this branch for commit sha1234.", }, "unable-to-run": { "main_xml_file": "unittest/file1.xml", @@ -127,9 +136,8 @@ def parameterize_named(**kwargs): "s3_prefix": "tvm-jenkins-artifacts-prod", "jenkins_prefix": "ci.tlcpack.ai", "common_main_build": """{"build_number": "4115", "state": "failed"}""", - "commit_sha": "SHA", - "expected_url": "issues/11594/comments", - "expected_body": """\n\nUnable to run tests bot because main failed to pass CI at SHA.""", + "commit_sha": "sha1234", + "expected_body": "Unable to run tests bot because main failed to pass CI at sha1234.", }, } # pylint: enable=line-too-long @@ -139,6 +147,7 @@ def parameterize_named(**kwargs): @parameterize_named(**TEST_DATA_SKIPPED_BOT) # 
pylint: enable=line-too-long def test_skipped_tests_comment( + caplog, tmpdir_factory, main_xml_file, main_xml_content, @@ -149,13 +158,11 @@ def test_skipped_tests_comment( jenkins_prefix, common_main_build, commit_sha, - expected_url, expected_body, ): """ Test that a comment with a link to the docs is successfully left on PRs """ - skipped_tests_script = REPO_ROOT / "ci" / "scripts" / "github_skipped_tests_comment.py" def write_xml_file(root_dir, xml_file, xml_content): shutil.rmtree(root_dir, ignore_errors=True) @@ -165,25 +172,45 @@ def write_xml_file(root_dir, xml_file, xml_content): f.write(textwrap.dedent(xml_content)) git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - pr_test_report_dir = Path(git.cwd) / "pr-reports" write_xml_file(pr_test_report_dir, pr_xml_file, pr_xml_content) main_test_report_dir = Path(git.cwd) / "main-reports" write_xml_file(main_test_report_dir, main_xml_file, main_xml_content) - proc = run_script( - [ - skipped_tests_script, - "--dry-run", - f"--s3-prefix={s3_prefix}", - f"--jenkins-prefix={jenkins_prefix}", - f"--common-main-build={common_main_build}", - ], - env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha}, - cwd=git.cwd, - ) - - assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr) + pr_data = { + "commits": { + "nodes": [ + { + "commit": { + "oid": commit_sha, + "statusCheckRollup": { + "contexts": { + "nodes": [ + { + "context": "tvm-ci/pr-head", + "targetUrl": target_url, + } + ] + } + }, + } + } + ] + } + } + with caplog.at_level(logging.INFO): + comment = scripts.github_skipped_tests_comment.get_skipped_tests_comment( + pr=pr_data, + github=None, + s3_prefix=s3_prefix, + jenkins_prefix=jenkins_prefix, + common_commit_sha=commit_sha, + pr_test_report_dir=pr_test_report_dir, + main_test_report_dir=main_test_report_dir, + common_main_build=json.loads(common_main_build), + ) + assert_in(expected_body, comment) + assert_in(f"with target {target_url}", caplog.text) 
@tvm.testing.skip_if_wheel_test @@ -192,27 +219,40 @@ def write_xml_file(root_dir, xml_file, xml_content): target_url="https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect", base_url="https://pr-docs.tlcpack.ai", commit_sha="SHA", - expected_url="issues/11594/comments", - expected_body="\n\nBuilt docs for commit SHA can be found " + expected_body="Built docs for commit SHA can be found " "[here](https://pr-docs.tlcpack.ai/PR-11594/3/docs/index.html).", ) ) -def test_docs_comment( - tmpdir_factory, target_url, base_url, commit_sha, expected_url, expected_body -): +def test_docs_comment(target_url, base_url, commit_sha, expected_body): """ Test that a comment with a link to the docs is successfully left on PRs """ - docs_comment_script = REPO_ROOT / "ci" / "scripts" / "github_docs_comment.py" - - git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) - proc = run_script( - [docs_comment_script, "--dry-run", f"--base-url-docs={base_url}"], - env={"TARGET_URL": target_url, "COMMIT_SHA": commit_sha}, - cwd=git.cwd, + pr_data = { + "commits": { + "nodes": [ + { + "commit": { + "oid": commit_sha, + "statusCheckRollup": { + "contexts": { + "nodes": [ + { + "context": "tvm-ci/pr-head", + "targetUrl": target_url, + } + ] + } + }, + } + } + ] + } + } + comment = scripts.github_docs_comment.get_doc_url( + pr=pr_data, + base_docs_url=base_url, ) - - assert_in(f"Dry run, would have posted {expected_url} with data {expected_body}.", proc.stderr) + assert_in(expected_body, comment) @tvm.testing.skip_if_wheel_test @@ -385,6 +425,149 @@ def test_update_branch(tmpdir_factory, statuses, expected_rc, expected_output): ) +# pylint: disable=line-too-long +@parameterize_named( + author_gate=dict( + pr_author="abc", + comments=[], + expected="Skipping comment for author abc", + ), + new_comment=dict( + pr_author="driazati", + comments=[], + expected="No existing comment found", + ), + update_comment=dict( + pr_author="driazati", + comments=[ + { + "author": {"login": 
"github-actions"}, + "databaseId": "comment456", + "body": " abc", + } + ], + expected="PATCH to https://api.github.com/repos/apache/tvm/issues/comments/comment456", + ), + new_body=dict( + pr_author="driazati", + comments=[], + expected="Commenting " + + textwrap.dedent( + """ + + + Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment. + + + * the cc + * the skipped tests + * the docs + """ + ).strip(), + ), + update_body=dict( + pr_author="driazati", + comments=[ + { + "author": {"login": "github-actions"}, + "databaseId": "comment456", + "body": textwrap.dedent( + """ + + + Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment. + + + * the cc + * something else + * the docs + """ + ).strip(), + } + ], + expected="Commenting " + + textwrap.dedent( + """ + + + Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment. 
+ + + * the cc + * something else + * the docs + * the skipped tests + """ + ).strip(), + ), +) +# pylint: enable=line-too-long +def test_pr_comment(tmpdir_factory, pr_author, comments, expected): + """ + Test the PR commenting bot + """ + comment_script = REPO_ROOT / "ci" / "scripts" / "github_pr_comment.py" + + git = TempGit(tmpdir_factory.mktemp("tmp_git_dir")) + target_url = "https://ci.tlcpack.ai/job/tvm/job/PR-11594/3/display/redirect" + commit = { + "commit": { + "oid": "sha1234", + "statusCheckRollup": { + "contexts": { + "nodes": [ + { + "context": "tvm-ci/pr-head", + "targetUrl": target_url, + } + ] + } + }, + } + } + data = { + "[1] POST - https://api.github.com/graphql": {}, + "[2] POST - https://api.github.com/graphql": { + "data": { + "repository": { + "pullRequest": { + "number": 1234, + "comments": { + "nodes": comments, + }, + "author": { + "login": pr_author, + }, + "commits": { + "nodes": [commit], + }, + } + } + } + }, + } + comments = { + "ccs": "the cc", + "docs": "the docs", + "skipped-tests": "the skipped tests", + } + proc = run_script( + [ + comment_script, + "--dry-run", + "--test-data", + json.dumps(data), + "--test-comments", + json.dumps(comments), + "--pr", + "1234", + ], + stderr=subprocess.STDOUT, + cwd=git.cwd, + ) + assert_in(expected, proc.stdout) + + @parameterize_named( dont_skip_main=dict( commands=[], @@ -873,6 +1056,7 @@ def test_github_tag_teams(tmpdir_factory, source_type, data, check): "--team-issue-json", json.dumps(teams), ], + stderr=subprocess.STDOUT, cwd=git.cwd, env=env, ) From aded9d43ba1e798031900911cca4613487db84fe Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 16 Sep 2022 17:01:11 -0500 Subject: [PATCH 192/704] [Testing] Add decorator tvm.testing.requires_cuda_compute_version (#12778) * [Testing] Add decorator tvm.testing.requires_cuda_compute_version Previously, individual unit tests would call `tvm.contrib.nvcc.get_target_compute_version` and return early. 
This was repeated boilerplate in many tests, and incorrectly reported a test as `PASSED` if the required infrastructure wasn't present. This commit introduces `tvm.testing.requires_cuda_compute_version`, a decorator that checks the CUDA compute version and applies `pytest.mark.skipif`. If required infrastructure isn't present, a test will be reported as `SKIPPED`. * requires_cuda_compute_version skips test when no GPU is present --- python/tvm/testing/utils.py | 44 ++++++ .../python/unittest/test_tir_ptx_cp_async.py | 7 +- .../python/unittest/test_tir_ptx_ldmatrix.py | 8 +- tests/python/unittest/test_tir_ptx_mma.py | 146 +++--------------- tests/python/unittest/test_tir_ptx_mma_sp.py | 14 +- ...est_tir_schedule_tensorize_ldmatrix_mma.py | 13 +- 6 files changed, 71 insertions(+), 161 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 37a27a4213e9..ad1e003d6e3f 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1058,6 +1058,50 @@ def inner(func): return inner +def requires_cuda_compute_version(major_version, minor_version=0): + """Mark a test as requiring at least a compute architecture + + Unit test marked with this decorator will run only if the CUDA + compute architecture of the GPU is at least `(major_version, + minor_version)`. + + This also marks the test as requiring a cuda support. + + Parameters + ---------- + major_version: int + + The major version of the (major,minor) version tuple. + + minor_version: int + + The minor version of the (major,minor) version tuple. + """ + min_version = (major_version, minor_version) + try: + arch = tvm.contrib.nvcc.get_target_compute_version() + compute_version = tvm.contrib.nvcc.parse_compute_version(arch) + except ValueError: + # No GPU present. This test will be skipped from the + # requires_cuda() marks as well. 
+ compute_version = (0, 0) + + min_version_str = ".".join(str(v) for v in min_version) + compute_version_str = ".".join(str(v) for v in compute_version) + requires = [ + pytest.mark.skipif( + compute_version < min_version, + reason=f"Requires CUDA compute >= {min_version_str}, but have {compute_version_str}", + ), + *requires_cuda.marks(), + ] + + def inner(func): + return _compose([func], requires) + + return inner + + def skip_if_32bit(reason): def decorator(*args): if "32bit" in platform.architecture()[0]: diff --git a/tests/python/unittest/test_tir_ptx_cp_async.py b/tests/python/unittest/test_tir_ptx_cp_async.py index 5e6535f295cb..dc521f3c471a 100644 --- a/tests/python/unittest/test_tir_ptx_cp_async.py +++ b/tests/python/unittest/test_tir_ptx_cp_async.py @@ -47,14 +47,9 @@ def ptx_cp_async(A: T.Buffer[(32, 128), "float16"], B: T.Buffer[(32, 128), "floa B[tx, i] = A_shared[tx, i] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_ptx_cp_async(): f = ptx_cp_async - arch = tvm.contrib.nvcc.get_target_compute_version() - major, _ = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return mod = tvm.build(f, target="cuda") A_np = np.random.rand(32, 128).astype("float16") diff --git a/tests/python/unittest/test_tir_ptx_ldmatrix.py b/tests/python/unittest/test_tir_ptx_ldmatrix.py index f718082ff8a1..f652be442133 100644 --- a/tests/python/unittest/test_tir_ptx_ldmatrix.py +++ b/tests/python/unittest/test_tir_ptx_ldmatrix.py @@ -56,15 +56,11 @@ def ptx_ldmatrix( B[8 * j + tx // 4, 8 * k + (tx % 4) * 2 + i] = A_local[4 * k + 2 * j + i] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7, 5) def test_ptx_ldmatrix(): f = ptx_ldmatrix _, _, param_num, param_trans = f.params - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major * 10 + minor < 75: - # Require at least SM75 - return + for num in [1, 2, 4]: for 
trans in [False, True]: mod = tvm.build(f.specialize({param_num: num, param_trans: trans}), target="cuda") diff --git a/tests/python/unittest/test_tir_ptx_mma.py b/tests/python/unittest/test_tir_ptx_mma.py index bee9b7b48020..cc9eec3a69d7 100644 --- a/tests/python/unittest/test_tir_ptx_mma.py +++ b/tests/python/unittest/test_tir_ptx_mma.py @@ -66,14 +66,9 @@ def gemm_mma_m8n8k4_row_col_fp64pf64fp64(a: T.handle, b: T.handle, c: T.handle): C[(tx % 32) // 4, (tx % 32) % 4 * 2 + mma_accum_c_id] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m8n8k4_row_col_fp64pf64fp64(): sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_col_fp64pf64fp64) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [8, 4]).astype("float64") @@ -147,14 +142,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7) def test_gemm_mma_m8n8k4_row_row_fp16fp16fp16(): sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp16) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 7: - # Require at least SM70 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16") @@ -235,14 +225,9 @@ def gemm_mma_m8n8k4_row_row_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7) def test_gemm_mma_m8n8k4_row_row_fp16fp16fp32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k4_row_row_fp16fp16fp32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if 
major < 7: - # Require at least SM70 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 4]).astype("float16") @@ -311,14 +296,9 @@ def gemm_mma_m8n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): # Failure occurs during the external call to nvcc, when attempting to # generate the .fatbin file. @tvm.testing.requires_nvcc_version(11) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7, 5) def test_gemm_mma_m8n8k16_row_col_s8s8s32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k16_row_col_s8s8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major * 10 + minor < 75: - # Require at least SM75 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [8, 16]).astype("int8") @@ -387,14 +367,9 @@ def gemm_mma_m8n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): # Failure occurs during the external call to nvcc, when attempting to # generate the .fatbin file. @tvm.testing.requires_nvcc_version(11) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7, 5) def test_gemm_mma_m8n8k16_row_col_s8u8s32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k16_row_col_s8u8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major * 10 + minor < 75: - # Require at least SM75 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [8, 16]).astype("int8") @@ -463,14 +438,9 @@ def gemm_mma_m8n8k32_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle): # Failure occurs during the external call to nvcc, when attempting to # generate the .fatbin file. 
@tvm.testing.requires_nvcc_version(11) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7, 5) def test_gemm_mma_m8n8k32_row_col_s4s4s32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k32_row_col_s4s4s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major * 10 + minor < 75: - # Require at least SM75 - return cuda_mod = tvm.build(sch.mod, target="cuda") ctx = tvm.cuda() @@ -531,14 +501,9 @@ def gemm_mma_m8n8k32_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle): # Failure occurs during the external call to nvcc, when attempting to # generate the .fatbin file. @tvm.testing.requires_nvcc_version(11) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(7, 5) def test_gemm_mma_m8n8k32_row_col_s4u4s32(): sch = tvm.tir.Schedule(gemm_mma_m8n8k32_row_col_s4u4s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major * 10 + minor < 75: - # Require at least SM75 - return cuda_mod = tvm.build(sch.mod, target="cuda") ctx = tvm.cuda() @@ -601,14 +566,9 @@ def gemm_mma_m16n8k8_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle) ] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k8_row_col_fp16fp16fp32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k8_row_col_fp16fp16fp32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 8]).astype("float16") @@ -682,15 +642,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp16(a: T.handle, b: T.handle, c: T.handle ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k16_row_col_fp16fp16fp16(): sch = 
tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_fp16fp16fp16) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16") @@ -764,15 +718,9 @@ def gemm_mma_m16n8k16_row_col_fp16fp16fp32(a: T.handle, b: T.handle, c: T.handle ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k16_row_col_fp16fp16fp32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_fp16fp16fp32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16") @@ -846,15 +794,9 @@ def gemm_mma_m16n8k16_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k16_row_col_s8s8s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8s8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8") @@ -928,15 +870,9 @@ def gemm_mma_m16n8k16_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k16_row_col_s8u8s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k16_row_col_s8u8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - 
major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [16, 16]).astype("int8") @@ -1010,15 +946,9 @@ def gemm_mma_m16n8k32_row_col_s8s8s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k32_row_col_s8s8s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k32_row_col_s8s8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [16, 32]).astype("int8") @@ -1092,15 +1022,9 @@ def gemm_mma_m16n8k32_row_col_s8u8s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k32_row_col_s8u8s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k32_row_col_s8u8s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-10, 10, [16, 32]).astype("int8") @@ -1174,15 +1098,9 @@ def gemm_mma_m16n8k64_row_col_s4s4s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k64_row_col_s4s4s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k64_row_col_s4s4s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = 
tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") ctx = tvm.cuda() @@ -1248,15 +1166,9 @@ def gemm_mma_m16n8k64_row_col_s4u4s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k64_row_col_s4u4s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k64_row_col_s4u4s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") ctx = tvm.cuda() @@ -1323,15 +1235,9 @@ def gemm_mma_m16n8k256_row_col_b1b1s32(a: T.handle, b: T.handle, c: T.handle): ] = Accum[mma_accum_c_id] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_gemm_mma_m16n8k256_row_col_b1b1s32(): sch = tvm.tir.Schedule(gemm_mma_m16n8k256_row_col_b1b1s32) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, minor = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Require at least SM80 - return - cuda_mod = tvm.build(sch.mod, target="cuda") cuda_mod = tvm.build(sch.mod, target="cuda") ctx = tvm.cuda() @@ -1345,20 +1251,4 @@ def test_gemm_mma_m16n8k256_row_col_b1b1s32(): if __name__ == "__main__": - test_gemm_mma_m8n8k4_row_col_fp64pf64fp64() - test_gemm_mma_m8n8k4_row_row_fp16fp16fp16() - test_gemm_mma_m8n8k4_row_row_fp16fp16fp32() - test_gemm_mma_m8n8k16_row_col_s8s8s32() - test_gemm_mma_m8n8k16_row_col_s8u8s32() - test_gemm_mma_m8n8k32_row_col_s4s4s32() - test_gemm_mma_m8n8k32_row_col_s4u4s32() - test_gemm_mma_m16n8k8_row_col_fp16fp16fp32() - test_gemm_mma_m16n8k16_row_col_fp16fp16fp16() - test_gemm_mma_m16n8k16_row_col_fp16fp16fp32() - test_gemm_mma_m16n8k16_row_col_s8s8s32() - test_gemm_mma_m16n8k16_row_col_s8u8s32() - test_gemm_mma_m16n8k32_row_col_s8s8s32() - test_gemm_mma_m16n8k32_row_col_s8u8s32() - 
test_gemm_mma_m16n8k64_row_col_s4s4s32() - test_gemm_mma_m16n8k64_row_col_s4u4s32() - test_gemm_mma_m16n8k256_row_col_b1b1s32() + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_ptx_mma_sp.py b/tests/python/unittest/test_tir_ptx_mma_sp.py index 24170b4898f9..0b5073864a43 100644 --- a/tests/python/unittest/test_tir_ptx_mma_sp.py +++ b/tests/python/unittest/test_tir_ptx_mma_sp.py @@ -255,7 +255,7 @@ def mma_sp_m16n8k32_f16f16f32(a: T.handle, b: T.handle, c: T.handle, _metadata: C[i // 2 * 8 + tx // 4, tx % 4 * 2 + i % 2] = accum[i] -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_mma_sp_m16n8k16_f16(): def get_meta_m16n8k16_half(mask): assert mask.shape == (16, 4, 2) @@ -273,11 +273,6 @@ def get_meta_m16n8k16_half(mask): for out_dtype in ["float16", "float32"]: func = mma_sp_m16n8k16_f16f16f16 if out_dtype == "float16" else mma_sp_m16n8k16_f16f16f32 sch = tvm.tir.Schedule(func) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, _ = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Requires SM80+ - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 8]).astype("float16") @@ -297,7 +292,7 @@ def get_meta_m16n8k16_half(mask): tvm.testing.assert_allclose(C_tvm.numpy(), C_np, atol=1e-3, rtol=1e-3) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_mma_sp_m16n8k32_f16(): def get_meta_m16n8k32_half(mask): assert mask.shape == (16, 8, 2) @@ -317,11 +312,6 @@ def get_meta_m16n8k32_half(mask): for out_dtype in ["float16", "float32"]: func = mma_sp_m16n8k32_f16f16f16 if out_dtype == "float16" else mma_sp_m16n8k32_f16f16f32 sch = tvm.tir.Schedule(func) - arch = tvm.contrib.nvcc.get_target_compute_version() - major, _ = tvm.contrib.nvcc.parse_compute_version(arch) - if major < 8: - # Requires SM80+ - return cuda_mod = tvm.build(sch.mod, target="cuda") A_np = np.random.uniform(-1, 1, [16, 16]).astype("float16") diff --git 
a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py index 32c1625653e5..2eda2b9ec458 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py +++ b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py @@ -111,9 +111,6 @@ def run_test( mma_store_intrin, ) - if not tvm.testing.is_ampere_or_newer(): - return None - f = tvm.build(sch.mod["main"], target="cuda", name="dense") dev = tvm.device("cuda", 0) @@ -155,7 +152,7 @@ def run_test( return lambda: f.time_evaluator(f.entry_name, dev, number=500)(a, b, c) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_f16f16f32_m16n16k16(): def index_map(i, j): return ( @@ -212,7 +209,7 @@ def index_map(i, j): print("f16f16f32_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean))) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_f16f16f16_m16n16k16(): def index_map(i, j): return ( @@ -269,7 +266,7 @@ def index_map(i, j): print("f16f16f16_m16n16k16_trans: %f GFLOPS" % (gflops / (timer().mean))) -@tvm.testing.requires_cuda +@tvm.testing.requires_cuda_compute_version(8) def test_i8i8i32_m16n16k32(): def index_map_A(i, j): return ( @@ -341,6 +338,4 @@ def index_map_C(i, j): if __name__ == "__main__": - test_f16f16f32_m16n16k16() - test_f16f16f16_m16n16k16() - test_i8i8i32_m16n16k32() + tvm.testing.main() From bb80f19ea8493af71c6130301f1b479143d213ee Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Fri, 16 Sep 2022 15:24:03 -0700 Subject: [PATCH 193/704] [Hexagon] Add debug option to hexagon pytest (#12795) * add debug option to hexagon pytest * address comment --- python/tvm/contrib/hexagon/build.py | 9 +++++---- python/tvm/contrib/hexagon/pytest_plugin.py | 21 +++++++++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/python/tvm/contrib/hexagon/build.py b/python/tvm/contrib/hexagon/build.py index 
fe7434f7386d..8960d110b85e 100644 --- a/python/tvm/contrib/hexagon/build.py +++ b/python/tvm/contrib/hexagon/build.py @@ -145,7 +145,7 @@ def start_server(self): ... @abc.abstractmethod - def stop_server(self): + def stop_server(self, cleanup=True): """Stop the RPC server""" ... @@ -509,11 +509,12 @@ def start_server(self): self._copy_binaries() self._run_server_script() - def stop_server(self): + def stop_server(self, cleanup=True): """Abstract method implementation. See description in HexagonLauncherRPC.""" self._cleanup_port_forwarding() self._terminate_remote() - self.cleanup_directory() + if cleanup: + self.cleanup_directory() class HexagonLauncherSimulator(HexagonLauncherRPC): @@ -617,7 +618,7 @@ def _start(self): def cleanup_directory(self): """Abstract method implementation. See description in HexagonLauncherRPC.""" - def stop_server(self): + def stop_server(self, cleanup=True): """Abstract method implementation. See description in HexagonLauncherRPC.""" self._server_process.terminate() diff --git a/python/tvm/contrib/hexagon/pytest_plugin.py b/python/tvm/contrib/hexagon/pytest_plugin.py index 0b9f65540c34..03f4a1a143c2 100644 --- a/python/tvm/contrib/hexagon/pytest_plugin.py +++ b/python/tvm/contrib/hexagon/pytest_plugin.py @@ -158,7 +158,7 @@ def adb_server_socket() -> str: @pytest.fixture(scope="session") def hexagon_server_process( - request, rpc_server_port_for_session, adb_server_socket, skip_rpc + request, rpc_server_port_for_session, adb_server_socket, skip_rpc, hexagon_debug ) -> HexagonLauncherRPC: """Initials and returns hexagon launcher if ANDROID_SERIAL_NUMBER is defined. This launcher is started only once per test session. 
@@ -194,7 +194,7 @@ def hexagon_server_process( yield {"launcher": launcher, "device_adr": device_adr} finally: if not skip_rpc: - launcher.stop_server() + launcher.stop_server(cleanup=(not hexagon_debug)) def read_device_list(): @@ -221,6 +221,7 @@ def hexagon_launcher( tvm_tracker_host, tvm_tracker_port, adb_server_socket, + hexagon_debug, ) -> HexagonLauncherRPC: """Initials and returns hexagon launcher which reuses RPC info and Android serial number.""" android_serial_num = android_serial_number() @@ -246,8 +247,9 @@ def hexagon_launcher( yield launcher finally: if android_serial_num == ["simulator"]: - launcher.stop_server() - launcher.cleanup_directory() + launcher.stop_server(cleanup=(not hexagon_debug)) + elif not hexagon_debug: + launcher.cleanup_directory() @pytest.fixture @@ -297,6 +299,11 @@ def skip_rpc(request) -> bool: return request.config.getoption("--skip-rpc") +@pytest.fixture(scope="session") +def hexagon_debug(request) -> bool: + return request.config.getoption("--hexagon-debug") + + def pytest_addoption(parser): parser.addoption("--gtest_args", action="store", default="") @@ -306,6 +313,12 @@ def pytest_addoption(parser): default=False, help="If set true, the RPC server initialization on Android would be skipped", ) + parser.addoption( + "--hexagon-debug", + action="store_true", + default=False, + help="If set true, it will keep the hexagon test directories on the target.", + ) def pytest_generate_tests(metafunc): From 38f53e8c95d6b4387510e38da89b02edb913e886 Mon Sep 17 00:00:00 2001 From: Janet Schneider Date: Fri, 16 Sep 2022 16:25:28 -0700 Subject: [PATCH 194/704] [Hexagon] [runtime] Improve runtime resource management (#12727) * First pass at improving runtime resource management * Add unit test * Fix lint and clang format errors * Disable resource reset for simulator * Moved acquire/release calls to session object, separate buffer managers for non-runtime (static) and runtime (dynamic). 
* Fix lint errors * Fix lint errors * Improve robustness of session shutdown * Fix lint * Address feedback * Only allow call to Acquire in a clean state * Use a pointer to indicate the "active" manager --- python/tvm/contrib/hexagon/session.py | 15 +++++++++-- src/runtime/hexagon/hexagon_device_api.cc | 26 +++++++++++++----- src/runtime/hexagon/hexagon_device_api.h | 27 ++++++++++++++++++- .../hexagon/hexagon_device_api_tests.cc | 18 +++++++++++++ 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py index 5619d036e283..e242a95aa8b8 100644 --- a/python/tvm/contrib/hexagon/session.py +++ b/python/tvm/contrib/hexagon/session.py @@ -88,14 +88,25 @@ def __enter__(self): self._rpc_receive_buffer_size_bytes, ], ) + func = self._rpc.get_function("device_api.hexagon.acquire_resources") + func() return self except RuntimeError as exception: raise exception def __exit__(self, exc_type, exc_value, exc_traceback): - # close session to the tracker - del self._rpc + try: + func = self._rpc.get_function("device_api.hexagon.release_resources") + func() + except RuntimeError as exception: + print( + "Exception occurred while calling release_resources() during Session __exit__: ", + exception, + ) + finally: + # close session to the tracker + del self._rpc @property def device(self): diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index fd3a0db2025b..463d9799b082 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -92,16 +92,16 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, int ndim, const int64_t* shap if (ndim == 0) { // Allocate storage for a single scalar value. 
- return hexbuffs.AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); + return mgr->AllocateHexagonBuffer(typesize, kHexagonAllocAlignment, mem_scope); } else if (ndim == 1) { // Allocate a single, contiguous memory region. size_t nbytes = shape[0] * typesize; - return hexbuffs.AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); + return mgr->AllocateHexagonBuffer(nbytes, kHexagonAllocAlignment, mem_scope); } else if (ndim == 2) { // Allocate the region(s) needed for Hexagon's indirect-tensor format. size_t nallocs = shape[0]; size_t nbytes = shape[1] * typesize; - return hexbuffs.AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope); + return mgr->AllocateHexagonBuffer(nallocs, nbytes, kHexagonAllocAlignment, mem_scope); } else { return nullptr; // unreachable } @@ -115,13 +115,13 @@ void* HexagonDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignme if (alignment < kHexagonAllocAlignment) { alignment = kHexagonAllocAlignment; } - return hexbuffs.AllocateHexagonBuffer(nbytes, alignment, String("global")); + return mgr->AllocateHexagonBuffer(nbytes, alignment, String("global")); } void HexagonDeviceAPI::FreeDataSpace(Device dev, void* ptr) { CHECK(ptr) << "buffer pointer is null"; CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; - hexbuffs.FreeHexagonBuffer(ptr); + mgr->FreeHexagonBuffer(ptr); } // WorkSpace: runtime allocations for Hexagon @@ -137,7 +137,7 @@ void* HexagonDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_ void HexagonDeviceAPI::FreeWorkspace(Device dev, void* data) { CHECK(IsValidDevice(dev)) << "dev.device_type: " << dev.device_type; - CHECK(hexbuffs.count(data) != 0) + CHECK(mgr->count(data) != 0) << "Attempt made to free unknown or already freed workspace allocation"; dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, data); } @@ -161,7 +161,7 @@ void HexagonDeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHan 
CHECK_EQ(to->byte_offset, 0); CHECK_EQ(GetDataSize(*from), GetDataSize(*to)); - auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { return hexbuffs.find(ptr); }; + auto lookup_hexagon_buffer = [this](void* ptr) -> HexagonBuffer* { return mgr->find(ptr); }; HexagonBuffer* hex_from_buf = lookup_hexagon_buffer(from->data); HexagonBuffer* hex_to_buf = lookup_hexagon_buffer(to->data); @@ -246,6 +246,18 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.free_nd").set_body([](TVMArgs args, TVMR *rv = static_cast(0); }); +TVM_REGISTER_GLOBAL("device_api.hexagon.acquire_resources") + .set_body([](TVMArgs args, TVMRetValue* rv) { + HexagonDeviceAPI* api = HexagonDeviceAPI::Global(); + api->AcquireResources(); + }); + +TVM_REGISTER_GLOBAL("device_api.hexagon.release_resources") + .set_body([](TVMArgs args, TVMRetValue* rv) { + HexagonDeviceAPI* api = HexagonDeviceAPI::Global(); + api->ReleaseResources(); + }); + TVM_REGISTER_GLOBAL("device_api.hexagon").set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = HexagonDeviceAPI::Global(); *rv = static_cast(ptr); diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index 4da12e35fbe7..b8861238771b 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -45,11 +45,29 @@ class HexagonDeviceAPI final : public DeviceAPI { static HexagonDeviceAPI* Global(); //! \brief Constructor - HexagonDeviceAPI() {} + HexagonDeviceAPI() { mgr = &hexbuffs; } //! \brief Destructor ~HexagonDeviceAPI() {} + //! \brief Ensures resource managers are in a good state for the runtime + void AcquireResources() { + CHECK_EQ(runtime_hexbuffs, nullptr); + runtime_hexbuffs = std::make_unique(); + LOG(INFO) << "runtime_hexbuffs created"; + mgr = runtime_hexbuffs.get(); + } + + //! 
\brief Ensures all runtime resources are freed + void ReleaseResources() { + if (runtime_hexbuffs && !runtime_hexbuffs->empty()) { + LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources"; + } + mgr = &hexbuffs; + LOG(INFO) << "runtime_hexbuffs reset"; + runtime_hexbuffs.reset(); + } + /*! \brief Currently unimplemented interface to specify the active * Hexagon device. */ @@ -138,7 +156,14 @@ class HexagonDeviceAPI final : public DeviceAPI { } //! \brief Manages underlying HexagonBuffer allocations + // runtime_hexbuffs is used for runtime allocations. It is created + // with a call to AcquireResources, and destroyed on ReleaseResources. + // hexbuffs is used for all allocations outside of the session lifetime. HexagonBufferManager hexbuffs; + std::unique_ptr runtime_hexbuffs; + + //! \brief Current buffer manager + HexagonBufferManager* mgr; }; } // namespace hexagon } // namespace runtime diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc index fbcee37cb154..1827c4059dea 100644 --- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc @@ -146,3 +146,21 @@ TEST_F(HexagonDeviceAPITest, DISABLED_alloc_free_diff_dev) { CHECK(buf != nullptr); EXPECT_THROW(hexapi->FreeDataSpace(cpu_dev, buf), InternalError); } + +// Alloc a non-runtime buffer +// Alloc a runtime buffer +// "Release" resources for runtime +// Verify the runtime buffer cannot be freed, but the non-runtime buffer can +// This test should be run last +TEST_F(HexagonDeviceAPITest, leak_resources) { + hexapi->ReleaseResources(); + void* pre_runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8); + CHECK(pre_runtime_buf != nullptr); + hexapi->AcquireResources(); + void* runtime_buf = hexapi->AllocDataSpace(hex_dev, nbytes, alignment, int8); + CHECK(runtime_buf != nullptr); + hexapi->ReleaseResources(); + EXPECT_THROW(hexapi->FreeDataSpace(hex_dev, 
runtime_buf), InternalError); + hexapi->FreeDataSpace(hex_dev, pre_runtime_buf); + hexapi->AcquireResources(); +} From 41b65a3144595afb04228be1334dc77c08d11ba7 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Fri, 16 Sep 2022 18:11:06 -0700 Subject: [PATCH 195/704] [TVMScript] IRBuilder methods for `Block` (#12815) This PR introduces remaining IRBuilder methods for `Block`. Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/frame.h | 35 +++ include/tvm/script/ir_builder/tir/ir.h | 49 ++++ python/tvm/script/ir_builder/base.py | 18 +- python/tvm/script/ir_builder/ir/ir.py | 2 +- python/tvm/script/ir_builder/tir/frame.py | 7 +- python/tvm/script/ir_builder/tir/ir.py | 235 +++++++++++++++--- src/script/ir_builder/tir/frame.cc | 15 ++ src/script/ir_builder/tir/ir.cc | 80 ++++++ .../unittest/test_tvmscript_ir_builder_tir.py | 50 +++- tests/scripts/task_mypy.sh | 3 + 10 files changed, 442 insertions(+), 52 deletions(-) diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index 2902b982d5a6..c76b400d96b4 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -187,6 +187,41 @@ class BlockFrame : public TIRFrame { TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockFrame, TIRFrame, BlockFrameNode); }; +/*! + * \brief A frame that represents the block initialization statment. + * + * \sa BlockInitFrame + */ +class BlockInitFrameNode : public TIRFrameNode { + public: + void VisitAttrs(tvm::AttrVisitor* v) { TIRFrameNode::VisitAttrs(v); } + + static constexpr const char* _type_key = "script.ir_builder.tir.BlockInitFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(BlockInitFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when entering RAII scope. + * \sa tvm::support::With + */ + void EnterWithScope() final; + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! 
+ * \brief Managed reference to BlockInitFrameNode. + * + * \sa BlockInitFrameNode + */ +class BlockInitFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(BlockInitFrame, TIRFrame, BlockInitFrameNode); +}; + /*! * \brief A frame that represents the for loop. * diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index 037606253adc..191887648dbd 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -141,6 +141,55 @@ void PreflattenedBuffer(Buffer postflattened_buffer, Array shape, */ BlockFrame Block(String name, bool no_realize = false); +/*! + * \brief The block initialization statement. + * \return The BlockInitFrame. + */ +BlockInitFrame Init(); + +/*! + * \brief The block predicate statement. + * \param predicate The predicate condition. + */ +void Where(PrimExpr predicate); + +/*! + * \brief The block buffer region reading statement. + * \param buffer_slices The array of buffer regions to read. + */ +void Reads(Array buffer_slices); + +/*! + * \brief The block buffer region writing statement. + * \param buffer_slices The array of buffer regions to write. + */ +void Writes(Array buffer_slices); + +/*! + * \brief The block annotation statement. + * \param attrs The annotation of the block. + */ +void BlockAttrs(Map attrs); + +/*! + * \brief The buffer allocation function. + * \param shape The type of the buffer prior to flattening. + * \param dtype The data type in the content of the buffer. + * \param data The pointer to the head of the data. + * \param strides The strides of each dimension. + * \param elem_offset The offset in terms of number of dtype elements (including lanes). + * \param storage_scope The optional storage scope of buffer data pointer. + * \param align The alignment requirement of data pointer in bytes. + * \param offset_factor The factor of elem_offset field. + * \param buffer_type The buffer type. 
+ * \param axis_separators The separators between input axes when generating flattened output axes. + * \return The allocated buffer. + */ +Buffer AllocBuffer(Array shape, DataType dtype = DataType::Float(32), + Optional data = NullOpt, Array strides = {}, + PrimExpr elem_offset = PrimExpr(), String storage_scope = "", int align = -1, + int offset_factor = 0, String buffer_type = "default", + Array axis_separators = {}); namespace axis { /*! diff --git a/python/tvm/script/ir_builder/base.py b/python/tvm/script/ir_builder/base.py index 767fa8bf2596..7aa33ee49c72 100644 --- a/python/tvm/script/ir_builder/base.py +++ b/python/tvm/script/ir_builder/base.py @@ -61,11 +61,11 @@ class IRBuilderFrame(_Object): """ def __enter__(self) -> "IRBuilderFrame": - _ffi_api.IRBuilderFrameEnter(self) # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilderFrameEnter(self) # type: ignore[attr-defined] # pylint: disable=no-member return self def __exit__(self, ptype, value, trace) -> None: # pylint: disable=unused-argument - _ffi_api.IRBuilderFrameExit(self) # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilderFrameExit(self) # type: ignore[attr-defined] # pylint: disable=no-member def add_callback(self, callback: Callable[[], None]) -> None: """Add a callback method invoked when exiting the with-scope. @@ -75,7 +75,7 @@ def add_callback(self, callback: Callable[[], None]) -> None: callback : Callable[[], None] The callback method to be invoked. 
""" - _ffi_api.IRBuilderFrameAddCallback( # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilderFrameAddCallback( # type: ignore[attr-defined] # pylint: disable=no-member self, callback ) @@ -104,7 +104,7 @@ class IRBuilder(_Object): def __init__(self) -> None: """Construct an IRBuilder.""" self.__init_handle_by_constructor__( - _ffi_api.IRBuilder # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilder # type: ignore[attr-defined] # pylint: disable=no-member ) def __enter__(self) -> "IRBuilder": @@ -119,11 +119,11 @@ def __enter__(self) -> "IRBuilder": with IRBuilder() as builder: assert IRBuilder.current() == builder """ - _ffi_api.IRBuilderEnter(self) # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilderEnter(self) # type: ignore[attr-defined] # pylint: disable=no-member return self def __exit__(self, ptype, value, trace) -> None: # pylint: disable=unused-argument - _ffi_api.IRBuilderExit(self) # pylint: disable=no-member # type: ignore + _ffi_api.IRBuilderExit(self) # type: ignore[attr-defined] # pylint: disable=no-member @staticmethod def current() -> "IRBuilder": @@ -134,11 +134,11 @@ def current() -> "IRBuilder": builder : IRBuilder The current IRBuilder. """ - return _ffi_api.IRBuilderCurrent() # pylint: disable=no-member # type: ignore + return _ffi_api.IRBuilderCurrent() # type: ignore[attr-defined] # pylint: disable=no-member def get(self) -> _Object: """Get the constructed IR.""" - return _ffi_api.IRBuilderGet(self) # pylint: disable=no-member # type: ignore + return _ffi_api.IRBuilderGet(self) # type: ignore[attr-defined] # pylint: disable=no-member @staticmethod def name(s: str, v: Any) -> Any: @@ -156,7 +156,7 @@ def name(s: str, v: Any) -> Any: v : Any The same object with the name set. 
""" - return _ffi_api.IRBuilderName(s, v) # pylint: disable=no-member # type: ignore + return _ffi_api.IRBuilderName(s, v) # type: ignore[attr-defined] # pylint: disable=no-member @staticmethod def name_many( # pylint: disable=invalid-name diff --git a/python/tvm/script/ir_builder/ir/ir.py b/python/tvm/script/ir_builder/ir/ir.py index df920364356b..213180463cb2 100644 --- a/python/tvm/script/ir_builder/ir/ir.py +++ b/python/tvm/script/ir_builder/ir/ir.py @@ -21,4 +21,4 @@ def ir_module() -> IRModuleFrame: - return _ffi_api.IRModule() # pylint: disable=no-member # type: ignore + return _ffi_api.IRModule() # type: ignore[attr-defined] # pylint: disable=no-member diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py index 75bb0231aeef..2ad08f35160d 100644 --- a/python/tvm/script/ir_builder/tir/frame.py +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -38,8 +38,13 @@ class BlockFrame(TIRFrame): ... +@_register_object("script.ir_builder.tir.BlockInitFrame") +class BlockInitFrame(TIRFrame): + ... + + @_register_object("script.ir_builder.tir.ForFrame") class ForFrame(TIRFrame): - def __enter__(self) -> Union[Var, List[Var]]: + def __enter__(self) -> Union[Var, List[Var]]: # type: ignore[override] super().__enter__() return self.vars if len(self.vars) > 1 else self.vars[0] diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 40cd99c744d7..d1dc1c89600d 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -25,6 +25,7 @@ Buffer, BufferLoad, BufferRegion, + IntImm, PrimExpr, StringImm, Var, @@ -85,7 +86,7 @@ def buffer_decl( The declared buffer. 
""" shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape - return _ffi_api.BufferDecl( # pylint: disable=no-member # type: ignore + return _ffi_api.BufferDecl( # type: ignore[attr-defined] # pylint: disable=no-member shape, dtype, "", @@ -108,7 +109,7 @@ def prim_func() -> frame.PrimFuncFrame: res : frame.PrimFuncFrame The PrimFuncFrame. """ - return _ffi_api.PrimFunc() # pylint: disable=no-member # type: ignore + return _ffi_api.PrimFunc() # type: ignore[attr-defined] # pylint: disable=no-member def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]: @@ -127,7 +128,7 @@ def arg(name: str, obj: Union[Var, Buffer]) -> Union[Var, Buffer]: res : Union[Var, Buffer] The argument. """ - return _ffi_api.Arg(name, obj) # pylint: disable=no-member # type: ignore + return _ffi_api.Arg(name, obj) # type: ignore[attr-defined] # pylint: disable=no-member def func_name(name: str) -> None: @@ -138,7 +139,7 @@ def func_name(name: str) -> None: name : str The name of the PrimFunc. """ - _ffi_api.FuncName(name) # pylint: disable=no-member # type: ignore + _ffi_api.FuncName(name) # type: ignore[attr-defined] # pylint: disable=no-member def func_attr(attrs: Dict[str, Any]) -> None: @@ -149,7 +150,7 @@ def func_attr(attrs: Dict[str, Any]) -> None: attrs : Dict[str, Any] The annotations of the PrimFunc. """ - _ffi_api.FuncAttrs(attrs) # pylint: disable=no-member # type: ignore + _ffi_api.FuncAttrs(attrs) # type: ignore[attr-defined] # pylint: disable=no-member def func_ret(ret_type: Type) -> Type: @@ -165,7 +166,7 @@ def func_ret(ret_type: Type) -> Type: res : Type The return type. 
""" - return _ffi_api.FuncRet(ret_type) # pylint: disable=no-member # type: ignore + return _ffi_api.FuncRet(ret_type) # type: ignore[attr-defined] # pylint: disable=no-member def match_buffer( @@ -242,7 +243,7 @@ def match_buffer( shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape if strides is None: strides = [] - return _ffi_api.MatchBuffer( # pylint: disable=no-member # type: ignore + return _ffi_api.MatchBuffer( # type: ignore[attr-defined] # pylint: disable=no-member param, shape, dtype, @@ -310,7 +311,7 @@ def preflattened_buffer( shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape if strides is None: strides = [] - _ffi_api.PreflattenedBuffer( # pylint: disable=no-member # type: ignore + _ffi_api.PreflattenedBuffer( # type: ignore[attr-defined] # pylint: disable=no-member postflattened, shape, dtype, @@ -341,7 +342,155 @@ def block(name: str = "", no_realize: bool = False) -> frame.BlockFrame: res : frame.BlockFrame The BlockFrame. """ - return _ffi_api.Block(name, no_realize) # pylint: disable=no-member # type: ignore + return _ffi_api.Block(name, no_realize) # type: ignore[attr-defined] # pylint: disable=no-member + + +def init() -> frame.BlockInitFrame: + """The block initialization statement. + + Returns + ------- + res : frame.BlockInitFrame + The BlockInitFrame. + """ + return _ffi_api.Init() # type: ignore[attr-defined] # pylint: disable=no-member + + +def where(predicate: Union[PrimExpr, int]) -> None: + """The block predicate statement. + + Parameters + ---------- + predicate : Union[PrimExpr, Literal[0, 1]] + The predicate condition. 
+ """ + if isinstance(predicate, bool): + predicate = IntImm("bool", predicate) + if isinstance(predicate, int): + if predicate in [0, 1]: + predicate = IntImm("bool", predicate) + else: + raise ValueError(f"Invalid value for predicate: {predicate}") + _ffi_api.Where(predicate) # type: ignore[attr-defined] # pylint: disable=no-member + + +def reads(*buffer_slices: List[Union[BufferRegion, BufferLoad]]) -> None: + """The block buffer region reading statement. + + Parameters + ---------- + buffer_slices : List[Union[BufferRegion, BufferLoad]] + The array of buffer regions to read. + """ + if len(buffer_slices) == 1: + if isinstance(buffer_slices[0], tuple): + buffer_slices = list(buffer_slices[0]) + elif isinstance(buffer_slices[0], list): + buffer_slices = buffer_slices[0] # type: ignore[assignment] + else: + buffer_slices = [buffer_slices[0]] + else: + buffer_slices = list(buffer_slices) # type: ignore[assignment] + _ffi_api.Reads(buffer_slices) # type: ignore[attr-defined] # pylint: disable=no-member + + +def writes(*buffer_slices: List[Union[BufferRegion, BufferLoad]]) -> None: + """The block buffer region writing statement. + + Parameters + ---------- + buffer_slices : List[Union[BufferRegion, BufferLoad]] + The array of buffer regions to write. + """ + if len(buffer_slices) == 1: + if isinstance(buffer_slices[0], tuple): + buffer_slices = list(buffer_slices[0]) + elif isinstance(buffer_slices[0], list): + buffer_slices = buffer_slices[0] # type: ignore[assignment] + else: + buffer_slices = [buffer_slices[0]] + else: + buffer_slices = list(buffer_slices) # type: ignore[assignment] + _ffi_api.Writes(buffer_slices) # type: ignore[attr-defined] # pylint: disable=no-member + + +def block_attr(attrs: Dict[str, Any]) -> None: + """The block annotation statement. + + Parameters + ---------- + attrs : Dict[str, Any] + The annotation of the block. 
+ """ + return _ffi_api.BlockAttrs(attrs) # type: ignore[attr-defined] # pylint: disable=no-member + + +def alloc_buffer( + shape: Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral], + dtype: str = "float32", + data: Var = None, + strides: List[PrimExpr] = None, + elem_offset: PrimExpr = None, + scope: str = "", + align: int = -1, + offset_factor: int = 0, + buffer_type: str = "default", + axis_separators: List[int] = None, +) -> Buffer: + """The buffer alllocation function. + + Parameters + ---------- + shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] + The type of the buffer prior to flattening. + + dtype : str + The data type in the content of the buffer. + + data : Var + The pointer to the head of the data. + + strides : List[PrimExpr] + The strides of each dimension. + + elem_offset : PrimExpr + The offset in terms of number of dtype elements (including lanes). + + scope : str + The optional storage scope of buffer data pointer. + + align : int + The alignment requirement of data pointer in bytes. + + offset_factor : int + The factor of elem_offset field. + + buffer_type : str + The buffer type. + + axis_separators : List[int] + The separators between input axes when generating flattened output axes. + + Returns + ------- + res : Buffer + The allocated buffer. + """ + shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape + if strides is None: + strides = [] + return _ffi_api.AllocBuffer( # type: ignore[attr-defined] # pylint: disable=no-member + shape, + dtype, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + ) def _as_range(dom: Union[Range, List[PrimExpr]]) -> Range: @@ -387,7 +536,7 @@ def spatial( res : Var The iteration variable. 
""" - return _ffi_api.AxisSpatial( # pylint: disable=no-member # type: ignore + return _ffi_api.AxisSpatial( # type: ignore[attr-defined] # pylint: disable=no-member _as_range(dom), binding, dtype ) @@ -413,7 +562,7 @@ def reduce( res : Var The iteration variable. """ - return _ffi_api.AxisReduce( # pylint: disable=no-member # type: ignore + return _ffi_api.AxisReduce( # type: ignore[attr-defined] # pylint: disable=no-member _as_range(dom), binding, dtype ) @@ -439,7 +588,7 @@ def scan( res : Var The iteration variable. """ - return _ffi_api.AxisScan( # pylint: disable=no-member # type: ignore + return _ffi_api.AxisScan( # type: ignore[attr-defined] # pylint: disable=no-member _as_range(dom), binding, dtype ) @@ -465,7 +614,7 @@ def opaque( res : Var The iteration variable. """ - return _ffi_api.AxisOpaque( # pylint: disable=no-member # type: ignore + return _ffi_api.AxisOpaque( # type: ignore[attr-defined] # pylint: disable=no-member _as_range(dom), binding, dtype ) @@ -489,7 +638,7 @@ def remap(kinds: str, bindings: List[PrimExpr], dtype: str = "int32") -> Union[L res : Var The iteration variables. 
""" - iter_vars = _ffi_api.AxisRemap( # pylint: disable=no-member # type: ignore + iter_vars = _ffi_api.AxisRemap( # type: ignore[attr-defined] # pylint: disable=no-member kinds, bindings, dtype ) return iter_vars[0] if len(iter_vars) == 1 else iter_vars @@ -522,7 +671,7 @@ def serial( if stop is None: stop = start start = 0 - return _ffi_api.Serial(start, stop, annotations) # pylint: disable=no-member # type: ignore + return _ffi_api.Serial(start, stop, annotations) # type: ignore[attr-defined] # pylint: disable=no-member def parallel( @@ -549,7 +698,7 @@ def parallel( if stop is None: stop = start start = 0 - return _ffi_api.Parallel(start, stop, annotations) # pylint: disable=no-member # type: ignore + return _ffi_api.Parallel(start, stop, annotations) # type: ignore[attr-defined] # pylint: disable=no-member def vectorized( @@ -576,7 +725,7 @@ def vectorized( if stop is None: stop = start start = 0 - return _ffi_api.Vectorized(start, stop, annotations) # pylint: disable=no-member # type: ignore + return _ffi_api.Vectorized(start, stop, annotations) # type: ignore[attr-defined] # pylint: disable=no-member def unroll( @@ -603,7 +752,7 @@ def unroll( if stop is None: stop = start start = 0 - return _ffi_api.Unroll(start, stop, annotations) # pylint: disable=no-member # type: ignore + return _ffi_api.Unroll(start, stop, annotations) # type: ignore[attr-defined] # pylint: disable=no-member def thread_binding( @@ -643,7 +792,7 @@ def thread_binding( elif stop is None: stop = start start = 0 - return _ffi_api.ThreadBinding( # pylint: disable=no-member # type: ignore + return _ffi_api.ThreadBinding( # type: ignore[attr-defined] # pylint: disable=no-member start, stop, thread, annotations ) @@ -661,7 +810,7 @@ def grid(*extents: PrimExpr) -> frame.ForFrame: res : frame.ForFrame The ForFrame. 
""" - return _ffi_api.Grid(extents) # pylint: disable=no-member # type: ignore + return _ffi_api.Grid(extents) # type: ignore[attr-defined] # pylint: disable=no-member def evaluate(value: PrimExpr) -> None: @@ -674,7 +823,7 @@ def evaluate(value: PrimExpr) -> None: """ if isinstance(value, str): value = StringImm(value) - return _ffi_api.Evaluate(value) # pylint: disable=no-member # type: ignore + return _ffi_api.Evaluate(value) # type: ignore[attr-defined] # pylint: disable=no-member def int8(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -690,7 +839,7 @@ def int8(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int8 or casted expression with type int8. """ - return _ffi_api.Int8(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int8(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int16(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -706,7 +855,7 @@ def int16(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int16 or casted expression with type int16. """ - return _ffi_api.Int16(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int16(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int32(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -722,7 +871,7 @@ def int32(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int32 or casted expression with type int32. """ - return _ffi_api.Int32(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int32(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int64(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -738,7 +887,7 @@ def int64(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int64 or casted expression with type int64. 
""" - return _ffi_api.Int64(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int64(expr) # type: ignore[attr-defined] # pylint: disable=no-member def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -754,7 +903,7 @@ def uint8(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type uint8 or casted expression with type uint8. """ - return _ffi_api.UInt8(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.UInt8(expr) # type: ignore[attr-defined] # pylint: disable=no-member def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -770,7 +919,7 @@ def uint16(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type uint16 or casted expression with type uint16. """ - return _ffi_api.UInt16(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.UInt16(expr) # type: ignore[attr-defined] # pylint: disable=no-member def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -786,7 +935,7 @@ def uint32(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type uint32 or casted expression with type uint32. """ - return _ffi_api.UInt32(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.UInt32(expr) # type: ignore[attr-defined] # pylint: disable=no-member def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -802,7 +951,7 @@ def uint64(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type uint64 or casted expression with type uint64. """ - return _ffi_api.UInt64(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.UInt64(expr) # type: ignore[attr-defined] # pylint: disable=no-member def float8(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -818,7 +967,7 @@ def float8(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type float8 or casted expression with type float8. 
""" - return _ffi_api.Float8(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Float8(expr) # type: ignore[attr-defined] # pylint: disable=no-member def float16(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -834,7 +983,7 @@ def float16(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type float16 or casted expression with type float16. """ - return _ffi_api.Float16(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Float16(expr) # type: ignore[attr-defined] # pylint: disable=no-member def float32(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -850,7 +999,7 @@ def float32(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type float32 or casted expression with type float32. """ - return _ffi_api.Float32(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Float32(expr) # type: ignore[attr-defined] # pylint: disable=no-member def float64(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -866,7 +1015,7 @@ def float64(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type float64 or casted expression with type float64. """ - return _ffi_api.Float64(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Float64(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -882,7 +1031,7 @@ def int32x4(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int32x4 or casted expression with type int32x4. """ - return _ffi_api.Int32x4(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int32x4(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -898,7 +1047,7 @@ def int32x8(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int32x8 or casted expression with type int32x8. 
""" - return _ffi_api.Int32x8(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int32x8(expr) # type: ignore[attr-defined] # pylint: disable=no-member def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -914,7 +1063,7 @@ def int32x16(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type int32x16 or casted expression with type int32x16. """ - return _ffi_api.Int32x16(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Int32x16(expr) # type: ignore[attr-defined] # pylint: disable=no-member def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -930,7 +1079,7 @@ def boolean(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type boolean or casted expression with type boolean. """ - return _ffi_api.Boolean(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Boolean(expr) # type: ignore[attr-defined] # pylint: disable=no-member def handle(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -946,7 +1095,7 @@ def handle(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type handle or casted expression with type handle. """ - return _ffi_api.Handle(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Handle(expr) # type: ignore[attr-defined] # pylint: disable=no-member def void(expr: Optional[PrimExpr] = None) -> PrimExpr: @@ -962,7 +1111,7 @@ def void(expr: Optional[PrimExpr] = None) -> PrimExpr: res : PrimExpr The new tir.Var with type void or casted expression with type void. """ - return _ffi_api.Void(expr) # pylint: disable=no-member # type: ignore + return _ffi_api.Void(expr) # type: ignore[attr-defined] # pylint: disable=no-member def var(dtype, name="") -> Var: @@ -981,7 +1130,7 @@ def var(dtype, name="") -> Var: res : Var The result tir.Var. 
""" - return Var(name, dtype) # pylint: disable=no-member # type: ignore + return Var(name, dtype) # pylint: disable=no-member # pylint: enable=invalid-name @@ -997,6 +1146,12 @@ def var(dtype, name="") -> Var: "match_buffer", "preflattened_buffer", "block", + "init", + "where", + "reads", + "writes", + "block_attr", + "alloc_buffer", "axis", "serial", "parallel", diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc index e54bf75eeff2..8b8b2a4d80e0 100644 --- a/src/script/ir_builder/tir/frame.cc +++ b/src/script/ir_builder/tir/frame.cc @@ -73,6 +73,20 @@ void BlockFrameNode::ExitWithScope() { } } +void BlockInitFrameNode::EnterWithScope() { + BlockFrame frame = FindBlockFrame("T.init"); + if (frame->init.defined()) { + LOG(FATAL) << "ValueError: Duplicate block init declaration"; + } + TIRFrameNode::EnterWithScope(); +} + +void BlockInitFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + BlockFrame frame = FindBlockFrame("T.init"); + frame->init = AsStmt(stmts); +} + void ForFrameNode::ExitWithScope() { TIRFrameNode::ExitWithScope(); AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts))); @@ -81,6 +95,7 @@ void ForFrameNode::ExitWithScope() { TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); TVM_REGISTER_NODE_TYPE(BlockFrameNode); +TVM_REGISTER_NODE_TYPE(BlockInitFrameNode); TVM_REGISTER_NODE_TYPE(ForFrameNode); } // namespace tir diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 5013e321728e..75e759262655 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -173,6 +173,80 @@ BlockFrame Block(String name, bool no_realize) { return BlockFrame(n); } +BlockInitFrame Init() { return BlockInitFrame(make_object()); } + +void Where(PrimExpr predicate) { + BlockFrame frame = FindBlockFrame("T.where"); + if (frame->predicate.defined()) { + LOG(FATAL) << "ValueError: Duplicate block predicate declaration, previous one is " + << 
frame->predicate; + } + frame->predicate = predicate; +} + +void Reads(Array buffer_slices) { + using namespace tvm::tir; + BlockFrame frame = FindBlockFrame("T.reads"); + if (frame->reads.defined()) { + LOG(FATAL) << "ValueError: Duplicate read region declaration, previous one is " << frame->reads; + } + Array reads; + for (const ObjectRef& obj : buffer_slices) { + if (const auto* buffer_region = obj.as()) { + reads.push_back(GetRef(buffer_region)); + } else if (const auto* buffer_load = obj.as()) { + reads.push_back(BufferRegionFromLoad(GetRef(buffer_load))); + } else { + LOG(FATAL) << "Invalid type for buffer reads."; + } + } + frame->reads = reads; +} + +void Writes(Array buffer_slices) { + using namespace tvm::tir; + BlockFrame frame = FindBlockFrame("T.writes"); + if (frame->writes.defined()) { + LOG(FATAL) << "ValueError: Duplicate write region declaration, previous one is " + << frame->writes; + } + Array writes; + for (const ObjectRef& obj : buffer_slices) { + if (const auto* buffer_region = obj.as()) { + writes.push_back(GetRef(buffer_region)); + } else if (const auto* buffer_load = obj.as()) { + writes.push_back(BufferRegionFromLoad(GetRef(buffer_load))); + } else { + LOG(FATAL) << "Invalid type for buffer writes."; + } + } + frame->writes = writes; +} + +void BlockAttrs(Map attrs) { + BlockFrame frame = FindBlockFrame("T.block_attr"); + if (frame->annotations.defined()) { + LOG(FATAL) << "ValueError: Duplicate block annotations, previous one is " << frame->annotations; + } + frame->annotations = attrs; +} + +Buffer AllocBuffer(Array shape, DataType dtype, Optional data, + Array strides, PrimExpr elem_offset, String storage_scope, int align, + int offset_factor, String buffer_type_str, Array axis_separators) { + Buffer buffer = BufferDecl(shape, dtype, "", data, strides, elem_offset, storage_scope, align, + offset_factor, buffer_type_str, axis_separators); + IRBuilder builder = IRBuilder::Current(); + if (Optional frame = builder->GetLastFrame()) { + 
frame.value()->alloc_buffers.push_back(buffer); + } else if (Optional frame = builder->GetLastFrame()) { + frame.value()->root_alloc_buffers.push_back(buffer); + } else { + LOG(FATAL) << "ValueError: Block frame or PrimFunc frame not find. Please ensure " + "'T.alloc_buffer' is called under T.block() or T.prim_func()"; + } + return buffer; +} namespace axis { IterVar PushBlockVar(IterVar iter_var, PrimExpr binding) { @@ -383,6 +457,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.MatchBuffer").set_body_typed(MatchBuf TVM_REGISTER_GLOBAL("script.ir_builder.tir.PreflattenedBuffer").set_body_typed(PreflattenedBuffer); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Block").set_body_typed(Block); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Init").set_body_typed(Init); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Where").set_body_typed(Where); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Reads").set_body_typed(Reads); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Writes").set_body_typed(Writes); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.BlockAttrs").set_body_typed(BlockAttrs); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AllocBuffer").set_body_typed(AllocBuffer); TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisSpatial").set_body_typed(axis::Spatial); TVM_REGISTER_GLOBAL("script.ir_builder.tir.AxisReduce").set_body_typed(axis::Reduce); diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index d893ebc545c6..a5d8c1068064 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -87,7 +87,7 @@ def test_ir_builder_tir_primfunc_complete(): assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) -def test_ir_builder_tir_block(): +def test_ir_builder_tir_block_base(): with IRBuilder() as ib: with T.block("block"): T.evaluate(0) @@ -114,6 +114,54 @@ def test_ir_builder_tir_block(): 
assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) +def test_ir_builder_tir_block_complete(): + with IRBuilder() as ib: + a = T.var("int64", "a") + b = T.buffer_decl((128, 128), "float32") + c = T.buffer_decl((128, 128), "float32") + d = T.var("int32", "d") + e = T.buffer_decl((128, 128), "float32") + f = T.var("int32", "f") + with T.block("block"): + T.where(a > 1) + T.reads(b[0:16, 0:16]) + T.writes(c[d:128, d:128]) + T.block_attr({"key": "value"}) + T.alloc_buffer((128, 128), "float32") + T.match_buffer(e[0:32, 0:32], (32, 32), "float32") + T.axis.spatial(128, f) + T.evaluate(0) + # the block generated by IRBuilder + block_realize_actual = ib.get() + + # the expected block + var_a = tir.Var("a", "int64") + buffer_b = tir.decl_buffer((128, 128), "float32", name="b") + buffer_c = tir.decl_buffer((128, 128), "float32", name="c") + var_d = tir.Var("d", "int32") + buffer_e = tir.decl_buffer((128, 128), "float32", name="c") + var_f = tir.Var("f", "int32") + block_expected = tir.Block( + iter_vars=[tir.IterVar((0, 128), tir.Var("", "int32"), iter_type=tir.IterVar.DataPar)], + reads=[buffer_b[0:16, 0:16]], + writes=[buffer_c[var_d:128, var_d:128]], + name_hint="block", + body=tir.Evaluate(0), + alloc_buffers=[tir.decl_buffer((128, 128), "float32")], + match_buffers=[ + tir.MatchBufferRegion(tir.decl_buffer((32, 32), "float32"), buffer_e[0:32, 0:32]) + ], + annotations={"key": "value"}, + ) + block_realize_expected = tir.BlockRealize( + iter_values=[var_f], + predicate=var_a > 1, + block=block_expected, + ) + # Check if the generated ir is expected + assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) + + def test_ir_builder_tir_axis(): with IRBuilder() as ib: a = T.var("int32", "a") diff --git a/tests/scripts/task_mypy.sh b/tests/scripts/task_mypy.sh index f165adfe1bc4..c3e5d50b3e03 100755 --- a/tests/scripts/task_mypy.sh +++ b/tests/scripts/task_mypy.sh @@ -47,3 +47,6 @@ mypy 
--disallow-untyped-defs python/tvm/relay/op/contrib/tensorrt.py #TODO(@mikepapadim): This is failing atm # echo "Checking MyPy Type defs in the tvm.relay.backend.contrib.ethosu package." # mypy --check-untyped-defs python/tvm/relay/backend/contrib/ethosu/ + +echo "Checking MyPy Type defs in the tvmscript IRBuilder package." +mypy --check-untyped-defs python/tvm/script/ir_builder From 2cae905a727930eaaeb59085393eef1e1421fc20 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Fri, 16 Sep 2022 21:11:31 -0400 Subject: [PATCH 196/704] [TIR] Support pattern matching argmax/argmin generated by TOPI (#12827) This PR introduces two reducers to TIR reduction part, so that rfactor and cross-thread reduction can be applied to those functions who contains argmax/argmin computation generated by TOPI. --- src/tir/schedule/primitive/reduction.cc | 134 +++++++++------ .../unittest/test_tir_schedule_rfactor.py | 156 +++++++++++++++++- 2 files changed, 233 insertions(+), 57 deletions(-) diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index 2dc47fa15bea..dd2bcf727c40 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -297,60 +297,86 @@ StmtSRef DecomposeReduction(ScheduleState self, const StmtSRef& block_sref, */ struct ReducerRegistry { ReducerRegistry() - : reducer_getters{CreateReducerGetter( - /*n_buffers=*/1, - [](const Array& x, const Array& y) { - return Array{x[0] + y[0]}; - }, - [](const Array& values) { - return Array{make_const(values[0]->dtype, 0)}; - }), - CreateReducerGetter( - /*n_buffers=*/1, - [](const Array& x, const Array& y) { - return Array{x[0] * y[0]}; - }, - [](const Array& values) { - return Array{make_const(values[0]->dtype, 1)}; - }), - CreateReducerGetter( - /*n_buffers=*/1, - [](const Array& x, const Array& y) { - return Array{min(x[0], y[0])}; - }, - [](const Array& values) { - return Array{max_value(values[0]->dtype)}; - }), - CreateReducerGetter( - 
/*n_buffers=*/1, - [](const Array& x, const Array& y) { - return Array{max(x[0], y[0])}; - }, - [](const Array& values) { - return Array{min_value(values[0]->dtype)}; - }), - CreateReducerGetter( - /*n_buffers=*/2, - [](const Array& x, const Array& y) { - PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]); - PrimExpr val = Select(x[1] >= y[1], x[1], y[1]); - return Array{idx, val}; - }, - [](const Array& values) { - return Array{make_const(values[0]->dtype, -1), - min_value(values[1]->dtype)}; - }), - CreateReducerGetter( - /*n_buffers=*/2, - [](const Array& x, const Array& y) { - PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]); - PrimExpr val = Select(x[1] <= y[1], x[1], y[1]); - return Array{idx, val}; - }, - [](const Array& values) { - return Array{make_const(values[0]->dtype, -1), - max_value(values[1]->dtype)}; - })} {} + : reducer_getters{ + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{x[0] + y[0]}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, 0)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{x[0] * y[0]}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, 1)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{min(x[0], y[0])}; + }, + [](const Array& values) { + return Array{max_value(values[0]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/1, + [](const Array& x, const Array& y) { + return Array{max(x[0], y[0])}; + }, + [](const Array& values) { + return Array{min_value(values[0]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = Select(x[1] >= y[1], x[0], y[0]); + PrimExpr val = Select(x[1] >= y[1], x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + min_value(values[1]->dtype)}; + }), + CreateReducerGetter( 
+ /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = + Select(Or(greater(x[1], y[1]), And(equal(x[1], y[1]), less(x[0], y[0]))), + x[0], y[0]); + PrimExpr val = Select(greater(x[1], y[1]), x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + min_value(values[1]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = Select(x[1] <= y[1], x[0], y[0]); + PrimExpr val = Select(x[1] <= y[1], x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + max_value(values[1]->dtype)}; + }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + PrimExpr idx = Select( + Or(less(x[1], y[1]), And(equal(x[1], y[1]), less(x[0], y[0]))), x[0], y[0]); + PrimExpr val = Select(less(x[1], y[1]), x[1], y[1]); + return Array{idx, val}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, -1), + max_value(values[1]->dtype)}; + })} {} static void RegisterReducer( int n_buffers, TypedPackedFunc(Array, Array)> combiner_getter, diff --git a/tests/python/unittest/test_tir_schedule_rfactor.py b/tests/python/unittest/test_tir_schedule_rfactor.py index f6db79f3ed23..964fe772d8af 100644 --- a/tests/python/unittest/test_tir_schedule_rfactor.py +++ b/tests/python/unittest/test_tir_schedule_rfactor.py @@ -15,12 +15,10 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=missing-function-docstring,missing-module-docstring -import sys - import pytest import tvm import tvm.testing -from tvm import tir +from tvm import te, tir, topi from tvm.script import tir as T from tvm.tir.schedule.testing import verify_trace_roundtrip @@ -1133,6 +1131,128 @@ def argmin_split_rfactor( argmin_v1[i] = v_argmin_v1 +@T.prim_func +def argmax_topi_rfactor( + placeholder: T.Buffer[(1, 32), "int32"], placeholder_red: T.Buffer[1, "int32"] +) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_red_temp_v0 = T.alloc_buffer([1], dtype="int32") + placeholder_red_temp_v1 = T.alloc_buffer([1], dtype="int32") + placeholder_red_temp_v0_rf = T.alloc_buffer([1, 8], dtype="int32") + placeholder_red_temp_v1_rf = T.alloc_buffer([1, 8], dtype="int32") + for i0, i1_0, i1_1 in T.grid(1, 4, 8): + with T.block("placeholder_red_temp_rf"): + vi1_1, ax0, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0]) + T.reads(placeholder[ax0, vi1_0 * 8 + vi1_1]) + T.writes(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1]) + with T.init(): + placeholder_red_temp_v0_rf[ax0, vi1_1] = -1 + placeholder_red_temp_v1_rf[ax0, vi1_1] = -2147483648 + v_placeholder_red_temp_v0_rf: T.int32 = T.Select( + placeholder_red_temp_v1_rf[ax0, vi1_1] > placeholder[ax0, vi1_0 * 8 + vi1_1] + or placeholder_red_temp_v1_rf[ax0, vi1_1] == placeholder[ax0, vi1_0 * 8 + vi1_1] + and placeholder_red_temp_v0_rf[ax0, vi1_1] < vi1_0 * 8 + vi1_1, + placeholder_red_temp_v0_rf[ax0, vi1_1], + vi1_0 * 8 + vi1_1, + ) + v_placeholder_red_temp_v1_rf: T.int32 = T.Select( + placeholder_red_temp_v1_rf[ax0, vi1_1] > placeholder[ax0, vi1_0 * 8 + vi1_1], + placeholder_red_temp_v1_rf[ax0, vi1_1], + placeholder[ax0, vi1_0 * 8 + vi1_1], + ) + placeholder_red_temp_v0_rf[ax0, vi1_1] = v_placeholder_red_temp_v0_rf + placeholder_red_temp_v1_rf[ax0, vi1_1] = v_placeholder_red_temp_v1_rf + for i0, i1_1 in T.grid(1, 8): + with T.block("placeholder_red_temp"): + vi1_1, 
ax0 = T.axis.remap("RS", [i1_1, i0]) + T.reads(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1]) + T.writes(placeholder_red_temp_v0[ax0], placeholder_red_temp_v1[ax0]) + with T.init(): + placeholder_red_temp_v0[ax0] = -1 + placeholder_red_temp_v1[ax0] = -2147483648 + v_placeholder_red_temp_v0: T.int32 = T.Select( + placeholder_red_temp_v1[ax0] > placeholder_red_temp_v1_rf[ax0, vi1_1] + or placeholder_red_temp_v1[ax0] == placeholder_red_temp_v1_rf[ax0, vi1_1] + and placeholder_red_temp_v0[ax0] < placeholder_red_temp_v0_rf[ax0, vi1_1], + placeholder_red_temp_v0[ax0], + placeholder_red_temp_v0_rf[ax0, vi1_1], + ) + v_placeholder_red_temp_v1: T.int32 = T.Select( + placeholder_red_temp_v1[ax0] > placeholder_red_temp_v1_rf[ax0, vi1_1], + placeholder_red_temp_v1[ax0], + placeholder_red_temp_v1_rf[ax0, vi1_1], + ) + placeholder_red_temp_v0[ax0] = v_placeholder_red_temp_v0 + placeholder_red_temp_v1[ax0] = v_placeholder_red_temp_v1 + for i0 in T.serial(1): + with T.block("placeholder_red"): + ax0 = T.axis.spatial(1, i0) + T.reads(placeholder_red_temp_v0[ax0]) + T.writes(placeholder_red[ax0]) + placeholder_red[ax0] = placeholder_red_temp_v0[ax0] + + +@T.prim_func +def argmin_topi_rfactor( + placeholder: T.Buffer[(1, 32), "int32"], placeholder_red: T.Buffer[1, "int32"] +) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + placeholder_red_temp_v0 = T.alloc_buffer([1], dtype="int32") + placeholder_red_temp_v1 = T.alloc_buffer([1], dtype="int32") + placeholder_red_temp_v0_rf = T.alloc_buffer([1, 8], dtype="int32") + placeholder_red_temp_v1_rf = T.alloc_buffer([1, 8], dtype="int32") + for i0, i1_0, i1_1 in T.grid(1, 4, 8): + with T.block("placeholder_red_temp_rf"): + vi1_1, ax0, vi1_0 = T.axis.remap("SSR", [i1_1, i0, i1_0]) + T.reads(placeholder[ax0, vi1_0 * 8 + vi1_1]) + T.writes(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1]) + with T.init(): + placeholder_red_temp_v0_rf[ax0, vi1_1] = -1 + 
placeholder_red_temp_v1_rf[ax0, vi1_1] = 2147483647 + v_placeholder_red_temp_v0_rf: T.int32 = T.Select( + placeholder_red_temp_v1_rf[ax0, vi1_1] < placeholder[ax0, vi1_0 * 8 + vi1_1] + or placeholder_red_temp_v1_rf[ax0, vi1_1] == placeholder[ax0, vi1_0 * 8 + vi1_1] + and placeholder_red_temp_v0_rf[ax0, vi1_1] < vi1_0 * 8 + vi1_1, + placeholder_red_temp_v0_rf[ax0, vi1_1], + vi1_0 * 8 + vi1_1, + ) + v_placeholder_red_temp_v1_rf: T.int32 = T.Select( + placeholder_red_temp_v1_rf[ax0, vi1_1] < placeholder[ax0, vi1_0 * 8 + vi1_1], + placeholder_red_temp_v1_rf[ax0, vi1_1], + placeholder[ax0, vi1_0 * 8 + vi1_1], + ) + placeholder_red_temp_v0_rf[ax0, vi1_1] = v_placeholder_red_temp_v0_rf + placeholder_red_temp_v1_rf[ax0, vi1_1] = v_placeholder_red_temp_v1_rf + for i0, i1_1 in T.grid(1, 8): + with T.block("placeholder_red_temp"): + vi1_1, ax0 = T.axis.remap("RS", [i1_1, i0]) + T.reads(placeholder_red_temp_v0_rf[ax0, vi1_1], placeholder_red_temp_v1_rf[ax0, vi1_1]) + T.writes(placeholder_red_temp_v0[ax0], placeholder_red_temp_v1[ax0]) + with T.init(): + placeholder_red_temp_v0[ax0] = -1 + placeholder_red_temp_v1[ax0] = 2147483647 + v_placeholder_red_temp_v0: T.int32 = T.Select( + placeholder_red_temp_v1[ax0] < placeholder_red_temp_v1_rf[ax0, vi1_1] + or placeholder_red_temp_v1[ax0] == placeholder_red_temp_v1_rf[ax0, vi1_1] + and placeholder_red_temp_v0[ax0] < placeholder_red_temp_v0_rf[ax0, vi1_1], + placeholder_red_temp_v0[ax0], + placeholder_red_temp_v0_rf[ax0, vi1_1], + ) + v_placeholder_red_temp_v1: T.int32 = T.Select( + placeholder_red_temp_v1[ax0] < placeholder_red_temp_v1_rf[ax0, vi1_1], + placeholder_red_temp_v1[ax0], + placeholder_red_temp_v1_rf[ax0, vi1_1], + ) + placeholder_red_temp_v0[ax0] = v_placeholder_red_temp_v0 + placeholder_red_temp_v1[ax0] = v_placeholder_red_temp_v1 + for i0 in T.serial(1): + with T.block("placeholder_red"): + ax0 = T.axis.spatial(1, i0) + T.reads(placeholder_red_temp_v0[ax0]) + T.writes(placeholder_red[ax0]) + placeholder_red[ax0] = 
placeholder_red_temp_v0[ax0] + + # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg @@ -1490,5 +1610,35 @@ def test_reduction_rfactor_argmax_init_buffer_not_match(): s.rfactor(ki, 1) +def test_reduction_rfactor_topi_argmax(): + A = te.placeholder((1, 32), dtype="int32") + B = topi.argmax(A, axis=1) + argmax_topi = te.create_prim_func([A, B]) + s = tir.Schedule(argmax_topi, debug_mask="all") + argmax = s.get_block("placeholder_red_temp") + _, k = s.get_loops(argmax) + _, ki = s.split(k, [None, 8]) + rf_block = s.rfactor(ki, 1) + tvm.ir.assert_structural_equal(s.mod["main"], argmax_topi_rfactor) + assert s.get(rf_block).same_as(s.get(s.get_block("placeholder_red_temp_rf"))) + assert s.get(argmax).same_as(s.get(s.get_block("placeholder_red_temp"))) + verify_trace_roundtrip(s, mod=argmax_topi) + + +def test_reduction_rfactor_topi_argmin(): + A = te.placeholder((1, 32), dtype="int32") + B = topi.argmin(A, axis=1) + argmin_topi = te.create_prim_func([A, B]) + s = tir.Schedule(argmin_topi, debug_mask="all") + argmin = s.get_block("placeholder_red_temp") + _, k = s.get_loops(argmin) + _, ki = s.split(k, [None, 8]) + rf_block = s.rfactor(ki, 1) + tvm.ir.assert_structural_equal(s.mod["main"], argmin_topi_rfactor) + assert s.get(rf_block).same_as(s.get(s.get_block("placeholder_red_temp_rf"))) + assert s.get(argmin).same_as(s.get(s.get_block("placeholder_red_temp"))) + verify_trace_roundtrip(s, mod=argmin_topi) + + if __name__ == "__main__": tvm.testing.main() From 91cce56cfa697a6a2e097bbae1c67ace22ef8af3 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 16 Sep 2022 18:13:58 -0700 Subject: [PATCH 197/704] [TIR] Construct the inverse in SuggestIndexMap (#12797) Computing the inverse mapping requires arithmetic analysis which is not guaranteed to cover all cases. We provide the pre-defined inverse index map instead. 
--- include/tvm/tir/index_map.h | 26 +++++++++- python/tvm/tir/function.py | 46 ++++++++++++++--- src/tir/ir/index_map.cc | 47 +++++++++++++++--- src/tir/schedule/analysis/layout.cc | 49 ++++++++++++++++--- .../unittest/test_tir_schedule_analysis.py | 41 ++++++++++++++++ 5 files changed, 188 insertions(+), 21 deletions(-) diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h index f461c5640bb0..8a176cb3cee8 100644 --- a/include/tvm/tir/index_map.h +++ b/include/tvm/tir/index_map.h @@ -70,6 +70,18 @@ class IndexMapNode : public Object { */ Array final_indices; + /*! + * \brief The inverse index map. + * + * When this is defined, IndexMap::Inverse will return the pre-defined inverse index map. + * Otherwise, the inverse index map will be computed on the fly. + * It is the user's responsibility to ensure the correctness of the pre-defined inverse index + * map. + * + * \note ObjectRef is used here instead of IndexMap to avoid circular reference. + */ + Optional inverse_index_map; + /*! * \brief Default constructor * @@ -133,6 +145,7 @@ class IndexMapNode : public Object { void VisitAttrs(AttrVisitor* v) { v->Visit("initial_indices", &initial_indices); v->Visit("final_indices", &final_indices); + v->Visit("inverse_index_map", &inverse_index_map); } bool SEqualReduce(const IndexMapNode* other, SEqualReducer equal) const { @@ -153,15 +166,24 @@ class IndexMapNode : public Object { class IndexMap : public ObjectRef { public: - IndexMap(Array initial_indices, Array final_indices); + /*! + * \brief The constructor + * \param initial_indices Variables representing the indices prior to remapping + * \param final_indices Expressions defining the indices after remapping. + * \param inverse_index_map The optional pre-defined inverse index map + */ + IndexMap(Array initial_indices, Array final_indices, + Optional inverse_index_map = NullOpt); /*! 
* \brief Create an index map from a packed function * \param ndim The number of dimensions * \param func The function to be applied + * \param inverse_index_map The optional pre-defined inverse index map * \return The created index map */ - static IndexMap FromFunc(int ndim, runtime::TypedPackedFunc(Array)> func); + static IndexMap FromFunc(int ndim, runtime::TypedPackedFunc(Array)> func, + Optional inverse_index_map = NullOpt); /*! \brief Generate the inverse mapping. * diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index 12c8053e39cc..e525fc2cc31a 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -271,6 +271,12 @@ class IndexMap(Object): Variables representing the indices prior to remapping. final_indices : List[PrimExpr] Expressions defining the indices after remapping. + inverse_index_map : Union[Callable, Optional[IndexMap]] + The optional pre-defined inverse index map. + When this is defined, IndexMap::Inverse will return the pre-defined inverse index map. + Otherwise, the inverse index map will be computed on the fly. + It is the user's responsibility to ensure the correctness of the pre-defined inverse + index map. """ initial_indices: List[Var] @@ -281,11 +287,19 @@ class IndexMap(Object): # Stage.transform_layout for more details. 
AXIS_SEPARATOR = "axis_separator" - def __init__(self, initial_indices, final_indices): - self.__init_handle_by_constructor__(_ffi_api.IndexMap, initial_indices, final_indices) + def __init__(self, initial_indices, final_indices, inverse_index_map): + if isinstance(inverse_index_map, Callable): + inverse_index_map = IndexMap.from_func(inverse_index_map) + self.__init_handle_by_constructor__( + _ffi_api.IndexMap, initial_indices, final_indices, inverse_index_map + ) @staticmethod - def from_func(mapping_function: Callable, ndim: Optional[int] = None): + def from_func( + mapping_function: Callable, + ndim: Optional[int] = None, + inverse_index_map: Union[Callable, Optional["IndexMap"]] = None, + ): """Create an index map from a function Parameters @@ -305,6 +319,13 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None): mapping_function does not use variadic arguments, ndim is optional. + inverse_index_map : Union[Callable, Optional[IndexMap]] + The optional pre-defined inverse index map. + When this is defined, IndexMap::Inverse will return the pre-defined inverse index map. + Otherwise, the inverse index map will be computed on the fly. + It is the user's responsibility to ensure the correctness of the pre-defined inverse + index map. + Returns ------- index_map: IndexMap @@ -312,7 +333,9 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None): Returns an IndexMap representing the `mapping_function`. """ - index_map, axis_separators = IndexMap.from_func_with_separators(mapping_function, ndim) + index_map, axis_separators = IndexMap.from_func_with_separators( + mapping_function, ndim, inverse_index_map + ) assert not axis_separators, ( "The mapping_function provided to IndexMap.from_func " "may not return IndexMap.AXIS_SEPARATOR. 
" @@ -321,7 +344,11 @@ def from_func(mapping_function: Callable, ndim: Optional[int] = None): return index_map @staticmethod - def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] = None): + def from_func_with_separators( + mapping_function: Callable, + ndim: Optional[int] = None, + inverse_index_map: Union[Callable, Optional["IndexMap"]] = None, + ): """Create an index map from a function Parameters @@ -341,6 +368,13 @@ def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] = mapping_function does not use variadic arguments, ndim is optional. + inverse_index_map : Union[Callable, Optional[IndexMap]] + The optional pre-defined inverse index map. + When this is defined, IndexMap::Inverse will return the pre-defined inverse index map. + Otherwise, the inverse index map will be computed on the fly. + It is the user's responsibility to ensure the correctness of the pre-defined inverse + index map. + Returns ------- ret: Tuple[IndexMap, List[int]] @@ -401,7 +435,7 @@ def from_func_with_separators(mapping_function: Callable, ndim: Optional[int] = f"Instead received {val} of type {type(val)}." ) - return IndexMap(initial_indices, final_indices), axis_separators + return IndexMap(initial_indices, final_indices, inverse_index_map), axis_separators def is_equivalent_to(self, other_map: "IndexMap") -> bool: """Return if the index maps are equivalent. 
diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 0e3c3b2774c8..cceff72ec82f 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -34,20 +34,23 @@ namespace tvm { namespace tir { -IndexMap::IndexMap(Array initial_indices, Array final_indices) { +IndexMap::IndexMap(Array initial_indices, Array final_indices, + Optional inverse_index_map) { auto n = make_object(); n->initial_indices = std::move(initial_indices); n->final_indices = std::move(final_indices); + n->inverse_index_map = std::move(inverse_index_map); data_ = std::move(n); } -IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc(Array)> func) { +IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc(Array)> func, + Optional inverse_index_map) { Array initial_indices; initial_indices.reserve(ndim); for (int i = 0; i < ndim; ++i) { initial_indices.push_back(Var("i" + std::to_string(i), DataType::Int(32))); } - return IndexMap(initial_indices, func(initial_indices)); + return IndexMap(initial_indices, func(initial_indices), std::move(inverse_index_map)); } std::pair IndexMap::NonSurjectiveInverse(Array initial_ranges) const { @@ -114,6 +117,10 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia } IndexMap IndexMap::Inverse(Array initial_ranges) const { + if ((*this)->inverse_index_map.defined()) { + // return the pre-defined inverse index map if exists. + return Downcast((*this)->inverse_index_map.value()); + } // Dummy variables to represent the inverse's inputs. Array output_vars; for (size_t i = 0; i < (*this)->final_indices.size(); i++) { @@ -232,7 +239,14 @@ Array IndexMapNode::MapShape(const Array& shape, return output; } -String IndexMapNode::ToPythonString() const { +/*! + * \brief Auxiliary function to convert an index map to lambda expression in Python. + * \param initial_indices The initial indices in the index map. + * \param final_indices The final indices in the index map. + * \return The lambda expression string. 
+ */ +std::string IndexMap2PythonLambdaExpr(const Array& initial_indices, + const Array& final_indices) { std::unordered_set used_names; Map var_remap; for (const Var& initial_index : initial_indices) { @@ -259,10 +273,28 @@ String IndexMapNode::ToPythonString() const { } oss << ": ("; for (size_t i = 0; i < final_indices.size(); ++i) { + if (i != 0) { + oss << " "; + } oss << Substitute(final_indices[i], var_remap); - oss << ", "; + oss << ","; } oss << ")"; + return oss.str(); +} + +String IndexMapNode::ToPythonString() const { + std::string lambda_expr = IndexMap2PythonLambdaExpr(initial_indices, final_indices); + if (!inverse_index_map.defined()) { + return String(lambda_expr); + } + // Also convert the inverse index map. + IndexMap inverse = Downcast(inverse_index_map.value()); + std::string inverse_lambda_expr = + IndexMap2PythonLambdaExpr(inverse->initial_indices, inverse->final_indices); + std::ostringstream oss; + oss << "tvm.tir.IndexMap.from_func(" << lambda_expr + << ", inverse_index_map=" << inverse_lambda_expr << ")"; return String(oss.str()); } @@ -275,8 +307,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) TVM_REGISTER_NODE_TYPE(IndexMapNode); TVM_REGISTER_GLOBAL("tir.IndexMap") - .set_body_typed([](Array initial_indices, Array final_indices) { - return IndexMap(initial_indices, final_indices); + .set_body_typed([](Array initial_indices, Array final_indices, + Optional inverse_index_map) { + return IndexMap(initial_indices, final_indices, inverse_index_map); }); TVM_REGISTER_GLOBAL("tir.IndexMapMapIndices") diff --git a/src/tir/schedule/analysis/layout.cc b/src/tir/schedule/analysis/layout.cc index b0cafac3151f..b071b2d7e4a1 100644 --- a/src/tir/schedule/analysis/layout.cc +++ b/src/tir/schedule/analysis/layout.cc @@ -167,20 +167,25 @@ Optional SuggestIndexMap(const Buffer& buffer, const Array& } return a.lower_factor > b.lower_factor; }); + // Compute the inverse permutation by argsort + std::vector inverse_order = order; + 
std::sort(inverse_order.begin(), inverse_order.end(), + [&order](int _a, int _b) -> bool { return order[_a] < order[_b]; }); // Step 5. Create the indexing mapping auto f_alter_layout = [f_flatten_index = std::move(f_flatten_index), // - split_exprs = std::move(split_exprs), // - order = std::move(order), // - shape = buffer->shape, // + &split_exprs, // + &order, // + & shape = buffer->shape, // analyzer // ](Array indices) -> Array { ICHECK_EQ(indices.size(), shape.size()); for (int i = 0, n = indices.size(); i < n; ++i) { analyzer->Bind(indices[i], Range::FromMinExtent(0, shape[i])); } + // Step 5.1: Fuse all indices into a flattened one PrimExpr index = f_flatten_index({indices.begin(), indices.end()}); int ndim = split_exprs.size(); - // Step 5.1. Split the flattened index according to `split_exprs` + // Step 5.2. Split the flattened index according to `split_exprs` std::vector split; split.reserve(ndim); for (int i = ndim - 1; i >= 0; --i) { @@ -190,7 +195,7 @@ Optional SuggestIndexMap(const Buffer& buffer, const Array& index = floordiv(index, extent); } std::reverse(split.begin(), split.end()); - // Step 5.2. Reorder the indexing pattern according to `order` + // Step 5.3. Reorder the indexing pattern according to `order` Array results; results.reserve(ndim); for (int i = 0; i < ndim; ++i) { @@ -198,7 +203,39 @@ Optional SuggestIndexMap(const Buffer& buffer, const Array& } return results; }; - return IndexMap::FromFunc(ndim, f_alter_layout); + // Step 6: Create the inverse index mapping. + auto f_inverse = [&inverse_order, &split_exprs, &shape = buffer->shape, + analyzer](Array indices) -> Array { + ICHECK_EQ(indices.size(), split_exprs.size()); + // Step 6.1: Reorder the indices according to `inverse_order`. This is the inverse of Step 5.3. 
+ // After the inverse permutation, indices[i] corresponds to split_exprs[i] + Array inv_permuted_indices; + inv_permuted_indices.reserve(indices.size()); + for (int i = 0, n = indices.size(); i < n; ++i) { + const Var& index = indices[inverse_order[i]]; + inv_permuted_indices.push_back(index); + analyzer->Bind(index, Range::FromMinExtent(0, Integer(split_exprs[i].extent))); + } + + // Step 6.2: Fuse all the indices. This is the inverse of Step 5.2. + PrimExpr flattened_index = make_const(indices[0]->dtype, 0); + int64_t stride = 1; + for (int i = static_cast(split_exprs.size()) - 1; i >= 0; --i) { + flattened_index = inv_permuted_indices[i] * Integer(stride) + flattened_index; + stride *= split_exprs[i].extent; + } + // Step 6.3: Split the flattened index into multiple indices. This is the inverse of Step 5.1. + Array result; + result.reserve(shape.size()); + for (int i = static_cast(shape.size()) - 1; i >= 0; --i) { + PrimExpr index = analyzer->Simplify(floormod(flattened_index, shape[i])); + flattened_index = floordiv(flattened_index, shape[i]); + result.push_back(index); + } + return Array(result.rbegin(), result.rend()); + }; + IndexMap inverse_index_map = IndexMap::FromFunc(split_exprs.size(), f_inverse); + return IndexMap::FromFunc(ndim, f_alter_layout, inverse_index_map); } TVM_REGISTER_GLOBAL("tir.schedule.SuggestIndexMap") diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index 5524abbaf094..378e5183b49c 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -101,6 +101,47 @@ def test_suggest_index_map_bijective(): assert index_map.is_equivalent_to(expected_index_map) +def test_suggest_index_map_winograd(): + """use case in winograd conv where the indices are complicated""" + fused_outer, i3_3_fused, i4_0, i4_1 = _make_vars("fused_outer", "i3_3_fused", "i4_0", "i4_1") + eps = floordiv(fused_outer, 336) * 2 + 
floordiv(floormod(fused_outer, 16), 8) + nu = floordiv(floormod(fused_outer, 336), 112) * 2 + floordiv(floormod(fused_outer, 8), 4) + co = floormod(fused_outer, 4) * 32 + i3_3_fused + ci = (i4_0 * 32) + i4_1 + buffer = decl_buffer(shape=[6, 6, 128, 128]) + index_map = suggest_index_map( + buffer=buffer, + indices=[eps, nu, co, ci], + loops=_make_loops( + loop_vars=[fused_outer, i3_3_fused, i4_0, i4_1], + extents=[1008, 32, 4, 32], + ), + predicate=True, + ) + expected_index_map = IndexMap.from_func( + lambda i0, i1, i2, i3: ( + floordiv(i0, 2), + floordiv(i1, 2), + floormod(i0, 2), + floormod(((i1 * 4) + floordiv(i2, 32)), 8), + floormod(i2, 32), + floordiv(i3, 32), + floormod(i3, 32), + ) + ) + assert index_map.is_equivalent_to(expected_index_map) + inverse_index_map = index_map.inverse(buffer.shape) + expected_inverse_index_map = IndexMap.from_func( + lambda i0, i1, i2, i3, i4, i5, i6: ( + ((i0 * 2) + i2), + ((i1 * 2) + floordiv(((i3 * 32) + i4), 128)), + floormod(((i3 * 32) + i4), 128), + ((i5 * 32) + i6), + ) + ) + assert inverse_index_map.is_equivalent_to(expected_inverse_index_map) + + @tvm.script.ir_module class DenseVNNIModule: @T.prim_func From e92f5d43f334752d4928764aa7203f229a07bd9b Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Sat, 17 Sep 2022 11:08:34 -0400 Subject: [PATCH 198/704] [BugFix][TIR] Fix Buffer LCA Detector (#12819) Prior to this PR, the LCA detector of buffers in TIR didn't take buffer memory scopes and GPU hierarchy into consideration. A consequent issue is that, when an intermediate buffer is in global memory, TIR's lowering passes don't necessarily allocate the intermediate buffer outside all `blockIdx`. As a result, the global intermediate buffer is allocated under a GPU thread block, which is illegal. This PR fixes this issue by fixing the LCA detector, making it aware of the buffer memory scopes and GPU hierarchy. With this fix, the global intermediate buffers are all allocated outside `blockIdx`.
--- .../analysis/buffer_access_lca_detector.cc | 45 ++++++++++++++++++- ...t_tir_analysis_detect_buffer_access_lca.py | 26 +++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/tir/analysis/buffer_access_lca_detector.cc b/src/tir/analysis/buffer_access_lca_detector.cc index b71e6b27f486..7197e1ba83c5 100644 --- a/src/tir/analysis/buffer_access_lca_detector.cc +++ b/src/tir/analysis/buffer_access_lca_detector.cc @@ -25,6 +25,7 @@ #include #include +#include "../../runtime/thread_storage_scope.h" #include "../../support/arena.h" namespace tvm { @@ -32,7 +33,11 @@ namespace tir { /*! * \brief Detect the lowest common ancestor(LCA) position of Buffer access. - * \note Only consider BlockNode and ForNode to be the LCA nodes. + * \note + * - Only consider BlockNode and ForNode to be the LCA nodes. + * - In the LCA locator, we are aware of the buffer scope and CUDA hierarchy so that any buffer in + * global memory will have its buffer access LCA outside all launch sites of `blockIdx`, in order to + * prevent conflicts between buffer memory scopes and CUDA hierarchy. 
*/ class LCADetector : public StmtExprVisitor { public: @@ -51,6 +56,8 @@ class LCADetector : public StmtExprVisitor { detector.ancestor_scopes_.push_back(&root); detector(func->body); + detector.UpdateWithBlockidx(); + // Prepare the return Map> buffer_lca; for (const auto& kv : detector.buffer_lca_) { @@ -82,6 +89,15 @@ class LCADetector : public StmtExprVisitor { int n = ancestor_scopes_.size(); const ScopeInfo* parent_scope = ancestor_scopes_.back(); auto* current_scope = arena_.make(parent_scope, op, n); + + if (op->thread_binding.defined()) { + const runtime::ThreadScope& scope = + runtime::ThreadScope::Create(op->thread_binding.value()->thread_tag); + if (scope.rank == 0) { + blockidx_scopes_.push_back(current_scope); + } + } + ancestor_scopes_.push_back(current_scope); StmtExprVisitor::VisitStmt_(op); ancestor_scopes_.pop_back(); @@ -107,6 +123,18 @@ class LCADetector : public StmtExprVisitor { ancestor_scopes_.pop_back(); } + void VisitStmt_(const AttrStmtNode* op) final { + if (op->attr_key == attr::thread_extent) { + const auto* iter = op->node.as(); + ICHECK_NOTNULL(iter); + const runtime::ThreadScope& scope = runtime::ThreadScope::Create(iter->thread_tag); + if (scope.rank == 0) { + blockidx_scopes_.push_back(ancestor_scopes_.back()); + } + } + StmtExprVisitor::VisitStmt_(op); + } + void VisitExpr_(const BufferLoadNode* op) final { UpdateBufferLCA(op->buffer.get()); StmtExprVisitor::VisitExpr_(op); @@ -150,6 +178,19 @@ class LCADetector : public StmtExprVisitor { } } + void UpdateWithBlockidx() { + for (const auto& it : buffer_lca_) { + const runtime::StorageScope& scope = + runtime::StorageScope::Create(GetRef(it.first).scope()); + if (scope.rank == runtime::StorageRank::kGlobal) { + const ScopeInfo*& lca = buffer_lca_[it.first]; + for (const ScopeInfo* blockidx_scope : blockidx_scopes_) { + lca = LowestCommonAncestor(lca, blockidx_scope); + } + } + } + } + static const ScopeInfo* LowestCommonAncestor(const ScopeInfo* lhs, const ScopeInfo* rhs) { if 
(lhs == nullptr) return rhs; if (rhs == nullptr) return lhs; @@ -186,6 +227,8 @@ class LCADetector : public StmtExprVisitor { std::unordered_map buffer_var_map_ = {}; /*! \brief The match buffers inside blocks. */ std::unordered_set match_buffers_ = {}; + /*! \brief The ForNodes/BlockNodes which contain immediate `blockIdx` launch. */ + std::vector blockidx_scopes_ = {}; /*! \brief Internal arena. */ support::Arena arena_; }; diff --git a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py index 344f37a23677..d438427e1fe1 100644 --- a/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py +++ b/tests/python/unittest/test_tir_analysis_detect_buffer_access_lca.py @@ -93,6 +93,19 @@ def match_buffer_func(a: T.handle, b: T.handle) -> None: T.evaluate(B1.data) +@T.prim_func +def global_buffer_with_blockidx( + a: T.Buffer[(1, 32), "int32"], b: T.Buffer[(1, 32), "int32"] +) -> None: + for i0 in T.thread_binding(0, 1, thread="blockIdx.x"): + for i1 in T.thread_binding(0, 32, thread="threadIdx.x"): + with T.block("copy"): + i, j = T.axis.remap("SS", [i0, i1]) + T.reads(a[i, j]) + T.writes(b[i, j]) + b[i, j] = a[i, j] + + def test_buffer_load_store(): func = buffer_load_store_func A, B = [func.buffer_map[x] for x in func.params] @@ -154,8 +167,21 @@ def test_match_buffer(): assert lca[B] == block +def test_global_buffer_with_blockidx(): + func = global_buffer_with_blockidx + A, B = [func.buffer_map[x] for x in func.params] + lca = tir.analysis.detect_buffer_access_lca(func) + + root_block = func.body.block + blockidx_loop = root_block.body + # LCA of both A and B should be the loop bound to `blockIdx` + assert lca[A] == blockidx_loop + assert lca[B] == blockidx_loop + + if __name__ == "__main__": test_buffer_load_store() test_opaque_access() test_lca_func_root() test_match_buffer() + test_global_buffer_with_blockidx() From 1ecf084eecaff167967df1a8c998de72e1198c24 Mon Sep 17 
00:00:00 2001 From: Lite Ye Date: Sat, 17 Sep 2022 16:54:01 -0400 Subject: [PATCH 199/704] [TVMScript] Add more helper functions to the printer infra (#12829) This PR is split from https://github.com/apache/tvm/pull/12492, to make the necessary updates to the printer infra for future PRs of TIR printer. Tracking issue: https://github.com/apache/tvm/issues/11912 Co-authored-by: Greg Bonik --- include/tvm/script/printer/doc.h | 64 +++++++++++++ .../script/printer/traced_object_functor.h | 37 +------- include/tvm/script/printer/var_table.h | 11 +++ src/script/printer/doc.cc | 30 ++++-- src/script/printer/ir_docsifier.cc | 2 +- src/script/printer/utils.h | 93 +++++++++++++++++++ src/script/printer/var_table.cc | 3 +- .../cpp/tvmscript_printer_irdocsifier_test.cc | 13 ++- ...ript_printer_traced_object_functor_test.cc | 37 ++++---- 9 files changed, 228 insertions(+), 62 deletions(-) create mode 100644 src/script/printer/utils.h diff --git a/include/tvm/script/printer/doc.h b/include/tvm/script/printer/doc.h index 72f343354b1b..1ee7fd6a7fd4 100644 --- a/include/tvm/script/printer/doc.h +++ b/include/tvm/script/printer/doc.h @@ -22,6 +22,7 @@ #include #include #include +#include namespace tvm { namespace script { @@ -87,6 +88,15 @@ class ExprDocNode : public DocNode { */ ExprDoc Attr(String attr) const; + /*! + * \brief Create a doc representing attribute access on the current ExprDoc + * \param attr The attribute to access. + * + * The ObjectPath of attr will be pushed to the source_path of the returned + * doc. + */ + ExprDoc Attr(TracedObject attr) const; + /*! * \brief Create a doc representing index access on the current ExprDoc * \param indices The indices to access. @@ -242,6 +252,7 @@ class LiteralDocNode : public ExprDocNode { class LiteralDoc : public ExprDoc { protected: explicit LiteralDoc(ObjectRef value); + LiteralDoc(ObjectRef value, ObjectPath object_path); public: /*! 
@@ -249,30 +260,83 @@ class LiteralDoc : public ExprDoc { */ static LiteralDoc None() { return LiteralDoc(ObjectRef(nullptr)); } + /*! + * \brief Create a LiteralDoc to represent None/null/empty value. + * \param object_path The source path of the returned Doc. + */ + static LiteralDoc None(ObjectPath object_path) { + return LiteralDoc(ObjectRef(nullptr), object_path); + } + /*! * \brief Create a LiteralDoc to represent integer. * \param v The integer value. */ static LiteralDoc Int(int v) { return LiteralDoc(IntImm(DataType::Int(64), v)); } + /*! + * \brief Create a LiteralDoc to represent integer. + * \param v The integer value. + * + * The ObjectPath of v will be pushed to the source_path of the returned doc. + */ + static LiteralDoc Int(const TracedObject& v) { return LiteralDoc(v.Get(), v.GetPath()); } + + /*! + * \brief Create a LiteralDoc to represent integer. + * \param v The integer value. + * + * The ObjectPath of v will be pushed to the source_path of the returned doc. + */ + static LiteralDoc Int(const TracedBasicValue& v) { + return LiteralDoc(IntImm(DataType::Int(64), v.Get()), v.GetPath()); + } /*! * \brief Create a LiteralDoc to represent boolean. * \param v The boolean value. */ static LiteralDoc Boolean(bool v) { return LiteralDoc(IntImm(DataType::Bool(), v)); } + /*! + * \brief Create a LiteralDoc to represent boolean. + * \param v The boolean value. + * + * The ObjectPath of v will be pushed to the source_path of the returned doc. + */ + static LiteralDoc Boolean(const TracedBasicValue& v) { + return LiteralDoc(IntImm(DataType::Bool(), v.Get()), v.GetPath()); + } + /*! * \brief Create a LiteralDoc to represent float. * \param v The float value. */ static LiteralDoc Float(double v) { return LiteralDoc(FloatImm(DataType::Float(64), v)); } + /*! + * \brief Create a LiteralDoc to represent float. + * \param v The float value. + * + * The ObjectPath of v will be pushed to the source_path of the returned doc. 
+ */ + static LiteralDoc Float(const TracedObject& v) { + return LiteralDoc(v.Get(), v.GetPath()); + } + /*! * \brief Create a LiteralDoc to represent string. * \param v The string value. */ static LiteralDoc Str(const String& v) { return LiteralDoc(v); } + /*! + * \brief Create a LiteralDoc to represent string. + * \param v The string value. + * + * The ObjectPath of v will be pushed to the source_path of the returned doc. + */ + static LiteralDoc Str(const TracedObject& v) { return LiteralDoc(v.Get(), v.GetPath()); } + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(LiteralDoc, ExprDoc, LiteralDocNode); }; diff --git a/include/tvm/script/printer/traced_object_functor.h b/include/tvm/script/printer/traced_object_functor.h index 6caaf8a6e0d5..8f72d139a5a5 100644 --- a/include/tvm/script/printer/traced_object_functor.h +++ b/include/tvm/script/printer/traced_object_functor.h @@ -34,35 +34,6 @@ namespace tvm { namespace script { namespace printer { -namespace { - -namespace detail { -/*! - * \brief Helper template class to extract the type of first argument of a function - * \tparam FType The function type. - */ -template -struct FirstArgTypeGetter; - -template -struct FirstArgTypeGetter { - using T = ArgOne; -}; - -/*! - * \brief Template alias for the type of first argument of a function - * \tparam FType The function type. - * - * The name of public functions are in snake case to be consistent with - * tvm/node/functor.h - */ -template -using FirstArgType = typename detail::FirstArgTypeGetter< - typename tvm::runtime::detail::function_signature::FType>::T; -} // namespace detail - -} // namespace - /* * This type alias and the following free functions are created to reduce the binary bloat * from template and also hide implementation details from this header @@ -156,8 +127,7 @@ class TracedObjectFunctor { * * The diaptch function should have signature `R(TracedObject, Args...)`. 
*/ - template ::ObjectRefType, + template ::value>> TSelf& set_dispatch(String token, TCallable f) { return set_dispatch( @@ -177,9 +147,10 @@ class TracedObjectFunctor { * * Default dispatch function has an empty string as dispatch token. */ - template + template ::value>> TSelf& set_dispatch(TCallable&& f) { - return set_dispatch(kDefaultDispatchToken, std::forward(f)); + return set_dispatch(kDefaultDispatchToken, std::forward(f)); } /*! diff --git a/include/tvm/script/printer/var_table.h b/include/tvm/script/printer/var_table.h index 9300a976c569..2cd9335213a3 100644 --- a/include/tvm/script/printer/var_table.h +++ b/include/tvm/script/printer/var_table.h @@ -103,6 +103,17 @@ class VarTableNode : public Object { */ Optional GetVarDoc(const ObjectRef& obj, const ObjectPath& object_path) const; + /*! + * \brief Get the doc for variable. + * \param obj The traced variable object. + * + * \return The doc for variable, if it exists in the table. Otherwise it returns NullOpt. + */ + template + Optional GetVarDoc(const TracedObject obj) const { + return GetVarDoc(obj.Get(), obj.GetPath()); + } + /*! * \brief Check if a variable exists in the table. * \param obj The variable object. 
diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc index d6f5ff35ab53..f3b431bd62db 100644 --- a/src/script/printer/doc.cc +++ b/src/script/printer/doc.cc @@ -27,6 +27,12 @@ namespace printer { ExprDoc ExprDocNode::Attr(String attr) const { return AttrAccessDoc(GetRef(this), attr); } +ExprDoc ExprDocNode::Attr(TracedObject attr) const { + auto doc = AttrAccessDoc(GetRef(this), attr.Get()); + doc->source_paths.push_back(attr.GetPath()); + return doc; +} + ExprDoc ExprDocNode::operator[](Array indices) const { return IndexDoc(GetRef(this), indices); } @@ -54,6 +60,13 @@ LiteralDoc::LiteralDoc(ObjectRef value) { this->data_ = std::move(n); } +LiteralDoc::LiteralDoc(ObjectRef value, ObjectPath object_path) { + ObjectPtr n = make_object(); + n->value = value; + n->source_paths.push_back(object_path); + this->data_ = std::move(n); +} + IdDoc::IdDoc(String name) { ObjectPtr n = make_object(); n->name = name; @@ -225,7 +238,8 @@ TVM_REGISTER_GLOBAL("script.printer.DocSetSourcePaths") }); TVM_REGISTER_NODE_TYPE(ExprDocNode); -TVM_REGISTER_GLOBAL("script.printer.ExprDocAttr").set_body_method(&ExprDocNode::Attr); +TVM_REGISTER_GLOBAL("script.printer.ExprDocAttr") + .set_body_method(&ExprDocNode::Attr); TVM_REGISTER_GLOBAL("script.printer.ExprDocIndex") .set_body_method(&ExprDocNode::operator[]); TVM_REGISTER_GLOBAL("script.printer.ExprDocCall") @@ -242,11 +256,15 @@ TVM_REGISTER_GLOBAL("script.printer.StmtBlockDoc").set_body_typed([](Array(LiteralDoc::None); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocInt") + .set_body_typed(LiteralDoc::Int); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocBoolean") + .set_body_typed(LiteralDoc::Boolean); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocFloat") + .set_body_typed(LiteralDoc::Float); +TVM_REGISTER_GLOBAL("script.printer.LiteralDocStr") + .set_body_typed(LiteralDoc::Str); TVM_REGISTER_NODE_TYPE(IdDocNode); TVM_REGISTER_GLOBAL("script.printer.IdDoc").set_body_typed([](String name) { return IdDoc(name); }); diff 
--git a/src/script/printer/ir_docsifier.cc b/src/script/printer/ir_docsifier.cc index b72ed48db63b..7f032ec50269 100644 --- a/src/script/printer/ir_docsifier.cc +++ b/src/script/printer/ir_docsifier.cc @@ -61,7 +61,7 @@ RootNodeContainer::RootNodeContainer(ObjectRef root_node) { // }); // \endcode TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch([](TracedObject obj, IRDocsifier p) -> Doc { + .set_dispatch([](TracedObject obj, IRDocsifier p) -> Doc { String top_dispatch_token = p->dispatch_tokens.back(); ICHECK_NE(top_dispatch_token, ""); ICHECK(false) << "Printing IR " << top_dispatch_token << " is not implemented."; diff --git a/src/script/printer/utils.h b/src/script/printer/utils.h new file mode 100644 index 000000000000..abe7ce5e9a88 --- /dev/null +++ b/src/script/printer/utils.h @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +#ifndef TVM_SCRIPT_PRINTER_UTILS_H_ +#define TVM_SCRIPT_PRINTER_UTILS_H_ + +#include +#include + +#include + +namespace tvm { +namespace script { +namespace printer { + +template +Array AsDocArray(const TracedArray& refs, const IRDocsifier& ir_docsifier) { + Array result; + for (auto ref : refs) { + result.push_back(ir_docsifier->AsExprDoc(ref)); + } + return result; +} + +template +Array AsDocArray(std::initializer_list&& refs, const IRDocsifier& ir_docsifier) { + Array result; + for (auto& ref : refs) { + result.push_back(ir_docsifier->AsExprDoc(ref)); + } + return result; +} + +template +Array AsExprDocArray(const TracedArray& refs, const IRDocsifier& ir_docsifier) { + return AsDocArray(refs, ir_docsifier); +} + +template +Array AsExprDocArray(std::initializer_list&& refs, + const IRDocsifier& ir_docsifier) { + return AsDocArray(std::move(refs), ir_docsifier); +} + +inline DictDoc AsDictDoc(const TracedMap& dict, + const IRDocsifier& ir_docsifier) { + Array keys; + Array values; + + for (auto p : dict) { + keys.push_back(LiteralDoc::Str(p.first)); + values.push_back(ir_docsifier->AsExprDoc(p.second)); + } + + auto doc = DictDoc(keys, values); + doc->source_paths.push_back(dict.GetPath()); + return doc; +} + +template +inline ListDoc AsListDoc(const TracedArray& arr, const IRDocsifier& ir_docsifier) { + auto ret = ListDoc(AsExprDocArray(arr, ir_docsifier)); + ret->source_paths.push_back(arr.GetPath()); + return ret; +} + +template +inline TupleDoc AsTupleDoc(const TracedArray& arr, const IRDocsifier& ir_docsifier) { + auto ret = TupleDoc(AsExprDocArray(arr, ir_docsifier)); + ret->source_paths.push_back(arr.GetPath()); + return ret; +} + +} // namespace printer +} // namespace script +} // namespace tvm + +#endif // TVM_SCRIPT_PRINTER_UTILS_H_ diff --git a/src/script/printer/var_table.cc b/src/script/printer/var_table.cc index 49ba93f9bcfe..62d8b2f66cc2 100644 --- a/src/script/printer/var_table.cc +++ b/src/script/printer/var_table.cc @@ -99,7 +99,8 @@ 
TVM_REGISTER_GLOBAL("script.printer.VarTableDefineByDoc") obj, [f = std::move(factory)]() { return f(); }, frame); }); TVM_REGISTER_GLOBAL("script.printer.VarTableGetVarDoc") - .set_body_method(&VarTableNode::GetVarDoc); + .set_body_method, const ObjectRef&, + const ObjectPath&>(&VarTableNode::GetVarDoc); TVM_REGISTER_GLOBAL("script.printer.VarTableIsVarDefined") .set_body_method(&VarTableNode::IsVarDefined); diff --git a/tests/cpp/tvmscript_printer_irdocsifier_test.cc b/tests/cpp/tvmscript_printer_irdocsifier_test.cc index fcdb5ed04e41..8c68399df222 100644 --- a/tests/cpp/tvmscript_printer_irdocsifier_test.cc +++ b/tests/cpp/tvmscript_printer_irdocsifier_test.cc @@ -45,14 +45,19 @@ class TestObject : public ObjectRef { TVM_REGISTER_NODE_TYPE(TestObjectNode); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch([](TracedObject obj, IRDocsifier p) { return IdDoc("x"); }); + .set_dispatch([](TracedObject obj, IRDocsifier p) { + return IdDoc("x"); + }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("tir", [](TracedObject obj, IRDocsifier p) { return IdDoc("tir"); }); + .set_dispatch("tir", [](TracedObject obj, IRDocsifier p) { + return IdDoc("tir"); + }); TVM_STATIC_IR_FUNCTOR(IRDocsifier, vtable) - .set_dispatch("relax", - [](TracedObject obj, IRDocsifier p) { return IdDoc("relax"); }); + .set_dispatch("relax", [](TracedObject obj, IRDocsifier p) { + return IdDoc("relax"); + }); TEST(PrinterIRDocsifierTest, AsDoc) { IRDocsifier p(Map{}); diff --git a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc b/tests/cpp/tvmscript_printer_traced_object_functor_test.cc index 374eb609b6cb..d662ce132405 100644 --- a/tests/cpp/tvmscript_printer_traced_object_functor_test.cc +++ b/tests/cpp/tvmscript_printer_traced_object_functor_test.cc @@ -33,7 +33,7 @@ class FooObjectNode : public Object { public: void VisitAttrs(AttrVisitor* v) {} - static constexpr const char* _type_key = "test.FooObject"; + static constexpr const char* _type_key = 
"test.TracedObjectFunctor.FooObject"; TVM_DECLARE_FINAL_OBJECT_INFO(FooObjectNode, Object); }; @@ -49,7 +49,7 @@ class BarObjectNode : public Object { public: void VisitAttrs(AttrVisitor* v) {} - static constexpr const char* _type_key = "test.BarObject"; + static constexpr const char* _type_key = "test.TracedObjectFunctor.BarObject"; TVM_DECLARE_FINAL_OBJECT_INFO(BarObjectNode, Object); }; @@ -69,8 +69,8 @@ TEST(TracedObjectFunctorTest, NormalRegistration) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch([](TracedObject o) -> String { return "Bar"; }); + functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); + functor.set_dispatch([](TracedObject o) -> String { return "Bar"; }); ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); ICHECK_EQ(functor("", MakeTraced(BarObject(), path)), "Bar"); @@ -80,8 +80,8 @@ TEST(TracedObjectFunctorTest, RegistrationWithFunction) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o) -> String { return "FooLambda"; }); - functor.set_dispatch("tir", ComputeFoo); + functor.set_dispatch([](TracedObject o) -> String { return "FooLambda"; }); + functor.set_dispatch("tir", ComputeFoo); ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "FooLambda"); ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo"); @@ -91,9 +91,11 @@ TEST(TracedObjectFunctorTest, RegistrationWithDispatchToken) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch("tir", [](TracedObject o) -> String { return "Foo tir"; }); - functor.set_dispatch("relax", [](TracedObject o) -> String { return "Foo relax"; }); + functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); + functor.set_dispatch("tir", + [](TracedObject o) -> String { return "Foo 
tir"; }); + functor.set_dispatch("relax", + [](TracedObject o) -> String { return "Foo relax"; }); ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir"); @@ -119,8 +121,8 @@ TEST(TracedObjectFunctorTest, ExtraArg) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o, int x) { return x; }); - functor.set_dispatch([](TracedObject o, int x) { return x + 1; }); + functor.set_dispatch([](TracedObject o, int x) { return x; }); + functor.set_dispatch([](TracedObject o, int x) { return x + 1; }); ICHECK_EQ(functor("", MakeTraced(FooObject(), path), 2), 2); ICHECK_EQ(functor("", MakeTraced(BarObject(), path), 2), 3); @@ -131,8 +133,9 @@ TEST(TracedObjectFunctorTest, RemoveDispatchFunction) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); - functor.set_dispatch("tir", [](TracedObject o) -> String { return "Foo tir"; }); + functor.set_dispatch([](TracedObject o) -> String { return "Foo"; }); + functor.set_dispatch("tir", + [](TracedObject o) -> String { return "Foo tir"; }); ICHECK_EQ(functor("", MakeTraced(FooObject(), path)), "Foo"); ICHECK_EQ(functor("tir", MakeTraced(FooObject(), path)), "Foo tir"); @@ -158,11 +161,11 @@ TEST(TracedObjectFunctorTest, DuplicateRegistration_WithoutToken) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch([](TracedObject o, int x) { return x; }); + functor.set_dispatch([](TracedObject o, int x) { return x; }); bool failed = false; try { - functor.set_dispatch([](TracedObject o, int x) { return x; }); + functor.set_dispatch([](TracedObject o, int x) { return x; }); } catch (...) 
{ failed = true; } @@ -173,11 +176,11 @@ TEST(TracedObjectFunctorTest, DuplicateRegistration_WithToken) { TracedObjectFunctor functor; ObjectPath path = ObjectPath::Root(); - functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); + functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); bool failed = false; try { - functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); + functor.set_dispatch("tir", [](TracedObject o, int x) { return x; }); } catch (...) { failed = true; } From d1871a6957b4f469f1b994aa6c89e0d209b64f05 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Sat, 17 Sep 2022 22:03:17 -0400 Subject: [PATCH 200/704] [MetaSchedule] Relax conditions of rule Cross-Thread Reduction (#12825) This PR relaxes the conditions of Meta-Schedule schedule rule CrossThreadReduction. The rules are previously a bit over-strict, and some workloads with small reduction loop length are unable to be optimized by cross-thread reduction automatically. In this PR, we relax the rules so that such workloads can be optimized. --- src/tir/schedule/analysis/analysis.cc | 6 +- ...le_schedule_rule_cross_thread_reduction.py | 98 +++++++++++++++++++ 2 files changed, 100 insertions(+), 4 deletions(-) diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index 4f78b0c9cd43..e39f7b25543c 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -1640,11 +1640,9 @@ bool NeedsRFactorOrCrossThreadReduction(const tir::ScheduleState& self, // if (NeedsMultiLevelTiling(self, block_sref)) { // Do not use rfactor/cross-thread-reduction if we have enough parallelism on spatial loops. return !(cum_space_len >= cum_reduce_len || cum_space_len > max_parallel_extent); - } else if (cum_reduce_len > 1) { - // Always try rfactor/cross-thread-reduction for other reduction blocks. 
- return cum_reduce_len > max_parallel_basic; } else { - return false; + // Always try rfactor/cross-thread-reduction for other reduction blocks. + return cum_reduce_len > 1; } } diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py index 4278638a1aa3..718b264bddd2 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_cross_thread_reduction.py @@ -589,6 +589,28 @@ def argmax( argmax_v1[i] = v_argmax_v1 +@T.prim_func +def argmax_32( + idx: T.Buffer[(1, 32), "int32"], + val: T.Buffer[(1, 32), "float32"], + argmax_v0: T.Buffer[(1,), "int32"], + argmax_v1: T.Buffer[(1,), "float32"], +) -> None: + for i0, i1 in T.grid(1, 32): + with T.block("argmax"): + i = T.axis.spatial(1, i0) + k = T.axis.reduce(32, i1) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.min_value("float32") + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k]) + v_argmax_v1: T.float32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k]) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + def test_gpu_argmax(): @T.prim_func def argmax_0( @@ -663,8 +685,84 @@ def argmax_1( ) +def test_gpu_argmax_32(): + @T.prim_func + def argmax_0( + idx: T.Buffer[(1, 32), "int32"], + val: T.Buffer[(1, 32), "float32"], + argmax_v0: T.Buffer[(1,), "int32"], + argmax_v1: T.Buffer[(1,), "float32"], + ) -> None: + # body + # with T.block("root") + for i0, i1 in T.grid(1, 32): + with T.block("argmax"): + i, k = T.axis.remap("SR", [i0, i1]) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select(argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, 
k]) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + @T.prim_func + def argmax_1( + idx: T.Buffer[(1, 32), "int32"], + val: T.Buffer[(1, 32), "float32"], + argmax_v0: T.Buffer[(1,), "int32"], + argmax_v1: T.Buffer[(1,), "float32"], + ) -> None: + # body + # with T.block("root") + for i0, i1_0 in T.grid(1, 1): + for i1_1 in T.thread_binding(64, thread="threadIdx.x"): + with T.block("argmax"): + i = T.axis.spatial(1, i0) + k = T.axis.reduce(32, i1_0 * 64 + i1_1) + T.where(i1_0 * 64 + i1_1 < 32) + T.reads(idx[i, k], val[i, k]) + T.writes(argmax_v0[i], argmax_v1[i]) + with T.init(): + argmax_v0[i] = -1 + argmax_v1[i] = T.float32(-3.4028234663852886e38) + v_argmax_v0: T.int32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v0[i], idx[i, k] + ) + v_argmax_v1: T.float32 = T.Select( + argmax_v1[i] >= val[i, k], argmax_v1[i], val[i, k] + ) + argmax_v0[i] = v_argmax_v0 + argmax_v1[i] = v_argmax_v1 + + decision_0 = [] # type: ignore + decision_1 = [ + ("SampleCategorical", 4), + ] + + mod = argmax_32 + actual = ms.TuneContext( + mod=mod, + target=Target("nvidia/geforce-rtx-3090", host="llvm"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=get_rules("cuda", ms.schedule_rule.CrossThreadReduction), + task_name="test", + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[argmax_0, argmax_1], + expected_decisions=[decision_0, decision_1], + ) + + if __name__ == "__main__": test_gpu_softmax_mn() test_gpu_softmax_mn_after_inline() test_gpu_batch_norm_bmn() test_gpu_argmax() + test_gpu_argmax_32() From b2c5addbb4e92aa770f0cd0847eabb43400ac9d2 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Sat, 17 Sep 2022 19:18:01 -0700 Subject: [PATCH 201/704] [TVMScript] IRBuilder methods for `Stmt` (#12830) This PR introduces IRBuilder methods for `Assert`, `Let`, `Realize`, `Evaluate`, `LaunchThread`, `EnvThread`. 
Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/frame.h | 132 ++++++++++++++++++ include/tvm/script/ir_builder/tir/ir.h | 40 ++++++ python/tvm/script/ir_builder/tir/frame.py | 20 +++ python/tvm/script/ir_builder/tir/ir.py | 131 +++++++++++++++++ src/script/ir_builder/tir/frame.cc | 27 ++++ src/script/ir_builder/tir/ir.cc | 67 +++++++++ .../unittest/test_tvmscript_ir_builder_tir.py | 69 +++++++++ 7 files changed, 486 insertions(+) diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index c76b400d96b4..38fe9009dd61 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -303,6 +303,138 @@ class AssertFrameNode : public TIRFrameNode { void ExitWithScope() final; }; +/*! + * \brief Managed reference to AssertFrameNode. + * + * \sa AssertFrameNode + */ +class AssertFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AssertFrame, TIRFrame, AssertFrameNode); +}; + +/*! + * \brief A frame represents the let binding expression, which binds a var. + * + * \sa LetFrameNode + */ +class LetFrameNode : public TIRFrameNode { + public: + /*! \brief The variable we bind to */ + tvm::tir::Var var; + /*! \brief The value we bind var to */ + PrimExpr value; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("var", &var); + v->Visit("value", &value); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.LetFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(LetFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to LetFrameNode. + * + * \sa LetFrameNode + */ +class LetFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(LetFrame, TIRFrame, LetFrameNode); +}; + +/*! + * \brief The LaunchThreadFrameNode. 
+ * \note It is used only inside a PrimFunc. + */ +class LaunchThreadFrameNode : public TIRFrameNode { + public: + /*! \brief The extent of environment thread. */ + PrimExpr extent; + /*! \brief The attribute key, could be either virtual_thread or thread_extent. */ + String attr_key; + /*! \brief The iteration variable. */ + tvm::tir::IterVar iter_var; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("extent", &extent); + v->Visit("attr_key", &attr_key); + v->Visit("iter_var", &iter_var); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.LaunchThreadFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(LaunchThreadFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to LaunchThreadFrameNode. + * + * \sa LaunchThreadFrameNode + */ +class LaunchThreadFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(LaunchThreadFrame, TIRFrame, + LaunchThreadFrameNode); +}; + +/*! + * \brief A frame that represents realization. + * + * \sa RealizeFrame + */ +class RealizeFrameNode : public TIRFrameNode { + public: + /*! \brief The region of buffer access. */ + tvm::tir::BufferRegion buffer_slice; + /*! \brief The storage scope associated with this realization. */ + String storage_scope; + /*! \brief The condition expression. */ + PrimExpr condition; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("buffer_slice", &buffer_slice); + v->Visit("storage_scope", &storage_scope); + v->Visit("condition", &condition); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.RealizeFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(RealizeFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! 
+ * \brief Managed reference to RealizeFrameNode. + * + * \sa RealizeFrameNode + */ +class RealizeFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(RealizeFrame, TIRFrame, RealizeFrameNode); +}; } // namespace tir } // namespace ir_builder } // namespace script diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index 191887648dbd..ec1f7f3753d1 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -292,6 +292,46 @@ ForFrame ThreadBinding(PrimExpr start, PrimExpr stop, String thread, */ ForFrame Grid(Array extents); +/*! + * \brief The assertion statement. + * \param condition The assertion condition. + * \param message The error message when the assertion fails. + * \return The AssertFrame. + */ +AssertFrame Assert(PrimExpr condition, String message); + +/*! + * \brief The let binding. + * \param var The variable to bind. + * \param value The value to be bound. + * \return The created LetFrame. + */ +LetFrame Let(Var var, PrimExpr value); + +/*! + * \brief The realization. + * \param buffer_slice The region of buffer access. + * \param storage_scope The storage scope associated with this realization. + * \param condition The condition expression. + * \return The result RealizeFrame. + */ +RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, PrimExpr condition); + +/*! + * \brief Launch a thread. + * \param var The iteration variable. + * \param extent The extent of environment thread. + * \return The result LaunchThreadFrame. + */ +LaunchThreadFrame LaunchThread(Var var, PrimExpr extent); + +/*! + * \brief Bind a var to thread env. + * \param thread_tag The thread type tag. + * \return The result variable which gets bound to the thread env. + */ +Var EnvThread(String thread_tag); + /*! * \brief Evaluate the input expression. * \param value The input expression to evaluate. 
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py index 2ad08f35160d..69bc5bfc9676 100644 --- a/python/tvm/script/ir_builder/tir/frame.py +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -48,3 +48,23 @@ class ForFrame(TIRFrame): def __enter__(self) -> Union[Var, List[Var]]: # type: ignore[override] super().__enter__() return self.vars if len(self.vars) > 1 else self.vars[0] + + +@_register_object("script.ir_builder.tir.AssertFrame") +class AssertFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.LetFrame") +class LetFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.RealizeFrame") +class RealizeFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.LaunchThreadFrame") +class LaunchThreadFrame(TIRFrame): + ... diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index d1dc1c89600d..6db8f40c32c8 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -26,6 +26,8 @@ BufferLoad, BufferRegion, IntImm, + IterVar, + Let, PrimExpr, StringImm, Var, @@ -813,6 +815,130 @@ def grid(*extents: PrimExpr) -> frame.ForFrame: return _ffi_api.Grid(extents) # type: ignore[attr-defined] # pylint: disable=no-member +def Assert(condition: PrimExpr, message: str) -> frame.AssertFrame: # pylint: disable=invalid-name + """Create an assertion statement. + + Parameters + ---------- + condition : PrimExpr + The PrimExpr to test. + + message : str + The output error message when the assertion fails. + + Returns + ------- + res : frame.AssertFrame + The result AssertFrame. + """ + return _ffi_api.Assert(condition, message) # type: ignore[attr-defined] # pylint: disable=no-member + + +def let( + v: Var, + value: PrimExpr, + body: PrimExpr = None, +) -> frame.LetFrame: + """Create a new let binding. + + Parameters + ---------- + v : Var + The variable to bind. + + value : PrimExpr + The value to be bound. 
+ + body : PrimExpr + The body expression, None will be used if it was not specified. + + Returns + ------- + res : frame.LetFrame + The result LetFrame. + """ + if body is None: + return _ffi_api.Let(v, value) # type: ignore[attr-defined] # pylint: disable=no-member + return Let(v, value, body) + + +def realize( + buffer_slice: BufferRegion, + storage_scope: str, + condition: PrimExpr = True, +) -> frame.RealizeFrame: + """Create a realization. + + Parameters + ---------- + buffer_slice : BufferRegion + The region of buffer access. + + storage_scope : str + The storage scope associated with this realization. + + condition: PrimExpr + The condition expression, the default is True. + + Returns + ------- + res : frame.RealizeFrame + The result RealizeFrame. + """ + return _ffi_api.Realize( # type: ignore[attr-defined] # pylint: disable=no-member + buffer_slice, storage_scope, condition + ) + + +def launch_thread( + iter_var: IterVar, # pylint: disable=redefined-outer-name + extent: PrimExpr, +) -> frame.LaunchThreadFrame: + """Launch a thread. + + Parameters + ---------- + iter_var : IterVar + The iteration variable. + + extent : PrimExpr + The extent of environment thread. + + Returns + ------- + res : frame.LaunchThreadFrame + The result LaunchThreadFrame. + + Examples + -------- + + .. code-block:: python + + from tvm.script.ir_builder import tir as T + brow = T.env_thread("blockIdx.y") + T.launch_thread(brow, 1) + + """ + return _ffi_api.LaunchThread(iter_var, extent) # type: ignore[attr-defined] # pylint: disable=no-member + + +def env_thread(thread_tag: str) -> IterVar: + """Bind a var to thread env" + + Parameters + ---------- + thread_tag : str + The thread type tag. + + Returns + ------- + res : IterVar + The result iteration variable gets bound to the thread env. + + """ + return _ffi_api.EnvThread(thread_tag) # type: ignore[attr-defined] # pylint: disable=no-member + + def evaluate(value: PrimExpr) -> None: """Evaluate the input expression. 
@@ -1159,6 +1285,11 @@ def var(dtype, name="") -> Var: "unroll", "thread_binding", "grid", + "Assert", + "let", + "realize", + "launch_thread", + "env_thread", "evaluate", "int8", "int16", diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc index 8b8b2a4d80e0..6c9459e6389c 100644 --- a/src/script/ir_builder/tir/frame.cc +++ b/src/script/ir_builder/tir/frame.cc @@ -92,11 +92,38 @@ void ForFrameNode::ExitWithScope() { AddToParent(this->f_make_for_loop(vars, doms, AsStmt(stmts))); } +void AssertFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::AssertStmt(condition, message, AsStmt(stmts))); +} + +void LetFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::LetStmt(var, value, AsStmt(stmts))); +} + +void RealizeFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::AttrStmt(buffer_slice->buffer, "realize_scope", + tvm::tir::StringImm(storage_scope), + tvm::tir::BufferRealize(buffer_slice->buffer, buffer_slice->region, + condition, AsStmt(stmts)))); +} + +void LaunchThreadFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::AttrStmt(iter_var, attr_key, extent, AsStmt(stmts))); +} + TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); TVM_REGISTER_NODE_TYPE(BlockFrameNode); TVM_REGISTER_NODE_TYPE(BlockInitFrameNode); TVM_REGISTER_NODE_TYPE(ForFrameNode); +TVM_REGISTER_NODE_TYPE(AssertFrameNode); +TVM_REGISTER_NODE_TYPE(LetFrameNode); +TVM_REGISTER_NODE_TYPE(RealizeFrameNode); +TVM_REGISTER_NODE_TYPE(LaunchThreadFrameNode); } // namespace tir } // namespace ir_builder diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 75e759262655..5951af298f62 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -395,6 +395,67 @@ ForFrame Grid(Array extents) { return ForFrame(n); } +AssertFrame Assert(PrimExpr condition, String 
message) { + ObjectPtr n = make_object(); + n->condition = condition; + n->message = tvm::tir::StringImm(message); + return AssertFrame(n); +} + +LetFrame Let(Var var, PrimExpr value) { + ObjectPtr n = make_object(); + n->var = var; + n->value = value; + return LetFrame(n); +} + +LaunchThreadFrame LaunchThread(Var var, PrimExpr extent) { + IterVar iter_var{nullptr}; + + if (Optional opt_frame = IRBuilder::Current()->FindFrame()) { + if (Optional opt_iter_var = opt_frame.value()->env_threads.Get(var)) { + iter_var = opt_iter_var.value(); + } else { + LOG(FATAL) << "ValueError: " << var->name_hint + << " is not an env_thread created using T.env_thread."; + } + } else { + LOG(FATAL) << "LaunchThread can only be used inside a PrimFunc"; + } + ObjectPtr n = make_object(); + if (!iter_var->dom.defined()) { + const_cast(iter_var.get())->dom = Range(0, extent); + } else if (!arith::Analyzer().CanProveEqual(iter_var->dom->extent, extent)) { + LOG(FATAL) << "ValueError: Inconsistent extents of environment thread. " + << iter_var->dom->extent << " vs " << extent; + } + n->iter_var = iter_var; + n->extent = extent; + n->attr_key = iter_var->thread_tag == "vthread" ? 
"virtual_thread" : "thread_extent"; + return LaunchThreadFrame(n); +} + +RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, + PrimExpr condition) { + ObjectPtr n = make_object(); + n->buffer_slice = buffer_slice; + n->storage_scope = storage_scope; + n->condition = condition; + return RealizeFrame(n); +} + +Var EnvThread(String thread_tag) { + IterVar iter_var(Range{nullptr}, Var("", DataType::Int(32)), tvm::tir::IterVarType::kThreadIndex, + thread_tag); + Var var = iter_var->var; + if (Optional opt_frame = IRBuilder::Current()->FindFrame()) { + opt_frame.value()->env_threads.Set(var, iter_var); + } else { + LOG(FATAL) << "EnvThread can only be used inside a PrimFunc"; + } + return var; +} + void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } using tvm::script::ir_builder::details::Namer; @@ -477,6 +538,12 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Unroll").set_body_typed(Unroll); TVM_REGISTER_GLOBAL("script.ir_builder.tir.ThreadBinding").set_body_typed(ThreadBinding); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Assert").set_body_typed(Assert); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Let").set_body_typed(Let); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Realize").set_body_typed(Realize); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.LaunchThread").set_body_typed(LaunchThread); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.EnvThread").set_body_typed(EnvThread); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8); diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index a5d8c1068064..7f2e6e1a4706 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -260,5 +260,74 @@ def 
test_ir_builder_tir_for(): assert_structural_equal(for_actual, for_expected, map_free_vars=True) +def test_ir_builder_tir_assert(): + with IRBuilder() as ib: + with T.Assert(T.var("int32", name="a") == 0, message="a is 0"): + T.evaluate(0) + # the assert generated by IRBuilder + assert_actual = ib.get() + + # the expected assert statement + assert_expected = tir.AssertStmt( + T.var("int32", name="a") == 0, tir.StringImm("a is 0"), tir.Evaluate(0) + ) + # Check if the generated ir is expected + assert_structural_equal(assert_actual, assert_expected, map_free_vars=True) + + +def test_ir_builder_tir_evaluate(): + with IRBuilder() as ib: + T.evaluate(0) + # the evaluate generated by IRBuilder + eval_actual = ib.get() + + # the expected evaluate + eval_expected = tir.Evaluate(0) + # Check if the generated ir is expected + assert_structural_equal(eval_actual, eval_expected, map_free_vars=True) + + +def test_ir_builder_tir_let(): + with IRBuilder() as ib: + with T.let(T.var("int32", name="a"), tir.IntImm("int32", 2)): + T.evaluate(0) + # the let binding generated by IRBuilder + let_actual = ib.get() + + # the expected Let statement + let_expected = tir.LetStmt(T.var("int32", name="a"), tir.IntImm("int32", 2), tir.Evaluate(0)) + assert_structural_equal(let_actual, let_expected, map_free_vars=True) + + +def test_ir_builder_tir_realize(): + buffer_a = T.buffer_decl((128, 128), "float32") + with IRBuilder() as ib: + with T.realize(buffer_a[0:128, 0:128], "test_storage_scope", True): + T.evaluate(0) + realize_actual = ib.get() + + # the expected buffer realization + buffer_realize = tir.BufferRealize( + buffer_a, [tvm.ir.Range(0, 128), tvm.ir.Range(0, 128)], True, tir.Evaluate(0) + ) + expected_realize = tir.AttrStmt( + buffer_a, "realize_scope", tir.StringImm("test_storage_scope"), buffer_realize + ) + assert_structural_equal(realize_actual, expected_realize, map_free_vars=True) + + +def test_ir_builder_tir_thread(): + with IRBuilder() as ib: + with T.prim_func(): + brow = 
T.env_thread("blockIdx.y") + with T.launch_thread(brow, 1): + T.evaluate(0) + ir_actual = ib.get() + iter_var = tir.IterVar((0, 1), "v", iter_type=1, thread_tag="blockIdx.y") + attr_stmt = tir.AttrStmt(iter_var, "thread_extent", 1, tir.Evaluate(0)) + func = tir.PrimFunc([], attr_stmt) + assert_structural_equal(ir_actual, func, map_free_vars=True) + + if __name__ == "__main__": tvm.testing.main() From 052e7028271be2aa2932e8721faf847940d28429 Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Sun, 18 Sep 2022 11:51:23 -0700 Subject: [PATCH 202/704] [TVMScript] IRBuilder methods for `Stmt` (#12831) This PR introduces IRBuilder methods for `allocate`, `Let`, `allocate_const`, `attr`, `While`, `If/Then/Else`, `decl_buffer`, `buffer_store`, `prefetch`. Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/frame.h | 307 ++++++++++++++++++ include/tvm/script/ir_builder/tir/ir.h | 97 ++++++ python/tvm/script/ir_builder/tir/frame.py | 48 ++- python/tvm/script/ir_builder/tir/ir.py | 271 ++++++++++++++++ src/script/ir_builder/tir/frame.cc | 78 +++++ src/script/ir_builder/tir/ir.cc | 86 +++++ src/script/ir_builder/tir/utils.h | 15 + .../unittest/test_tvmscript_ir_builder_tir.py | 173 +++++++++- 8 files changed, 1061 insertions(+), 14 deletions(-) diff --git a/include/tvm/script/ir_builder/tir/frame.h b/include/tvm/script/ir_builder/tir/frame.h index 38fe9009dd61..aa2386e7f1e4 100644 --- a/include/tvm/script/ir_builder/tir/frame.h +++ b/include/tvm/script/ir_builder/tir/frame.h @@ -435,6 +435,313 @@ class RealizeFrame : public TIRFrame { public: TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(RealizeFrame, TIRFrame, RealizeFrameNode); }; + +/*! + * \brief A frame represents the allocate. + * + * \sa AllocateFrame + */ +class AllocateFrameNode : public TIRFrameNode { + public: + /*! \brief The extents of the allocate. */ + Array extents; + /*! \brief The data type of the buffer. */ + DataType dtype; + /*! \brief The storage scope. */ + String storage_scope; + /*! 
\brief The condition. */ + PrimExpr condition; + /*! \brief Additional annotation hints. */ + Map annotations; + /*! \brief The buffer. */ + tvm::tir::Buffer buffer; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("extents", &extents); + v->Visit("dtype", &dtype); + v->Visit("storage_scope", &storage_scope); + v->Visit("condition", &condition); + v->Visit("annotations", &annotations); + v->Visit("buffer", &buffer); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.AllocateFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(AllocateFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to AllocateFrameNode. + * + * \sa AllocateFrameNode + */ +class AllocateFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AllocateFrame, TIRFrame, AllocateFrameNode); +}; + +/*! + * \brief A frame represents the allocate constant. + * + * \sa AllocateConstFrame + */ +class AllocateConstFrameNode : public TIRFrameNode { + public: + /*! \brief The data type of the buffer. */ + DataType dtype; + /*! \brief The extents of the allocate. */ + Array extents; + /*! \brief The data associated with the constant. */ + tvm::runtime::NDArray data; + /*! \brief The buffer */ + tvm::tir::Buffer buffer; + /*! \brief Additional annotations about the allocation. */ + Map annotations; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("dtype", &dtype); + v->Visit("extents", &extents); + v->Visit("data", &data); + v->Visit("buffer", &buffer); + v->Visit("annotations", &annotations); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.AllocateConstFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(AllocateConstFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. 
+ * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to AllocateConstFrameNode. + * + * \sa AllocateConstFrameNode + */ +class AllocateConstFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AllocateConstFrame, TIRFrame, + AllocateConstFrameNode); +}; +/*! + * \brief A frame that represents attribute node. + * + * \sa AttrFrame + */ +class AttrFrameNode : public TIRFrameNode { + public: + /*! \brief The node to annotate the attribute. */ + ObjectRef node; + /*! \brief Attribute type key. */ + String attr_key; + /*! \brief The value of the attribute. */ + PrimExpr value; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("node", &node); + v->Visit("attr_key", &attr_key); + v->Visit("value", &value); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.AttrFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(AttrFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to AttrFrameNode. + * + * \sa AttrFrameNode + */ +class AttrFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(AttrFrame, TIRFrame, AttrFrameNode); +}; + +/*! + * \brief A frame that represents while loop. + * + * \sa WhileFrame + */ +class WhileFrameNode : public TIRFrameNode { + public: + /*! \brief The termination condition of while. */ + PrimExpr condition; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("condition", &condition); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.WhileFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(WhileFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! 
+ * \brief Managed reference to WhileFrameNode. + * + * \sa WhileFrameNode + */ +class WhileFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(WhileFrame, TIRFrame, WhileFrameNode); +}; + +/*! + * \brief A frame that represents if statement. + * + * \sa IfFrame + */ +class IfFrameNode : public TIRFrameNode { + public: + /*! \brief The condition of the if statement. */ + PrimExpr condition; + /*! \brief The statements in the true branch. */ + Optional> then_stmts; + /*! \brief The statements in the false branch. */ + Optional> else_stmts; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("condition", &condition); + v->Visit("then_stmts", &then_stmts); + v->Visit("else_stmts", &else_stmts); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.IfFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(IfFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to IfFrameNode. + * + * \sa IfFrameNode + */ +class IfFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(IfFrame, TIRFrame, IfFrameNode); +}; + +/*! + * \brief A frame that represents then. + * + * \sa ThenFrame + */ +class ThenFrameNode : public TIRFrameNode { + public: + static constexpr const char* _type_key = "script.ir_builder.tir.ThenFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(ThenFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when entering RAII scope. + * \sa tvm::support::With + */ + void EnterWithScope() final; + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to ThenFrameNode. 
+ * + * \sa ThenFrameNode + */ +class ThenFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ThenFrame, TIRFrame, ThenFrameNode); +}; + +/*! + * \brief A frame that represents else. + * + * \sa ElseFrame + */ +class ElseFrameNode : public TIRFrameNode { + public: + static constexpr const char* _type_key = "script.ir_builder.tir.ElseFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(ElseFrameNode, TIRFrameNode); + + public: + /*! + * \brief The method called when entering RAII scope. + * \sa tvm::support::With + */ + void EnterWithScope() final; + /*! + * \brief The method called when exiting RAII scope. + * \sa tvm::support::With + */ + void ExitWithScope() final; +}; + +/*! + * \brief Managed reference to ElseFrameNode. + * + * \sa ElseFrameNode + */ +class ElseFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(ElseFrame, TIRFrame, ElseFrameNode); +}; + +class DeclBufferFrameNode : public TIRFrameNode { + public: + tvm::tir::Buffer buffer; + + void VisitAttrs(tvm::AttrVisitor* v) { + TIRFrameNode::VisitAttrs(v); + v->Visit("buffer", &buffer); + } + + static constexpr const char* _type_key = "script.ir_builder.tir.DeclBufferFrame"; + TVM_DECLARE_FINAL_OBJECT_INFO(DeclBufferFrameNode, TIRFrameNode); + + public: + void ExitWithScope() final; +}; + +class DeclBufferFrame : public TIRFrame { + public: + TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(DeclBufferFrame, TIRFrame, DeclBufferFrameNode); +}; + } // namespace tir } // namespace ir_builder } // namespace script diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index ec1f7f3753d1..dd289b691502 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -28,6 +28,7 @@ namespace script { namespace ir_builder { namespace tir { +using tvm::runtime::NDArray; using tvm::tir::Buffer; using tvm::tir::Var; @@ -317,6 +318,87 @@ LetFrame Let(Var var, PrimExpr value); */ 
RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, PrimExpr condition); +/*! + * \brief The allocate node. + * \param extents The extents of the allocate. + * \param dtype The data type of the buffer. + * \param storage_scope The storage scope. + * \param condition The condition. + * \param annotations Additional annotation hints. + * \return The created AllocateFrame. + */ +AllocateFrame Allocate(Array extents, DataType dtype, String storage_scope = "", + Optional condition = NullOpt, + Optional> annotations = NullOpt); + +/*! + * \brief The allocate constant node. + * \param data The data associated with the constant. + * \param dtype The data type of the buffer. + * \param extents The extents of the allocate. + * \param annotations Additional annotation hints. + * \return The created AllocateConstFrame. + */ +AllocateConstFrame AllocateConst( + NDArray data, DataType dtype, Array extents, + Map annotations = NullValue>()); + +/*! + * \brief Create an attribute. + * \param node The node to annotate the attribute. + * \param attr_key Attribute type key. + * \param value The value of the attribute. + * \return The result AttrFrame. + */ +AttrFrame Attr(ObjectRef node, String attr_key, PrimExpr value); + +/*! + * \brief Create a while loop. + * \param condition The termination condition of the loop. + * \return The result WhileFrame. + */ +WhileFrame While(PrimExpr condition); + +/*! + * \brief Create an if statement. + * \param condition The condition of if statement. + * \return The result IfFrame. + */ +IfFrame If(PrimExpr condition); + +/*! + * \brief Create a then. + * \return The result ThenFrame. + */ +ThenFrame Then(); + +/*! + * \brief Create an else. + * \return The result ElseFrame. + */ +ElseFrame Else(); + +/*! + * \brief The buffer declaration frame. + * \param shape The type of the buffer prior to flattening. + * \param dtype The data type in the content of the buffer. + * \param buffer_name The name of the buffer. 
+ * \param data The pointer to the head of the data. + * \param strides The strides of each dimension. + * \param elem_offset The offset in terms of number of dtype elements (including lanes). + * \param storage_scope The optional storage scope of buffer data pointer. + * \param align The alignment requirement of data pointer in bytes. + * \param offset_factor The factor of elem_offset field. + * \param buffer_type The buffer type. + * \param axis_separators The separators between input axes when generating flattened output axes. + * \return The declared buffer. + */ +DeclBufferFrame DeclBuffer(Array shape, DataType dtype, String buffer_name, + Optional data, Optional> strides, + Optional elem_offset, String storage_scope, int align, + int offset_factor, String buffer_type, + Optional> axis_separators); + /*! * \brief Launch a thread. * \param var The iteration variable. @@ -332,6 +414,21 @@ LaunchThreadFrame LaunchThread(Var var, PrimExpr extent); */ Var EnvThread(String thread_tag); +/*! + * \brief Store data in a buffer. + * \param buffer The buffer. + * \param value The value to be stored. + * \param indices The indices location to be stored. + */ +void BufferStore(Buffer buffer, PrimExpr value, Array indices); + +/*! + * \brief The prefetch hint for a buffer + * \param buffer The buffer to be prefetched. + * \param bounds The bounds to be prefetched. + */ +void Prefetch(Buffer buffer, Array bounds); + /*! * \brief Evaluate the input expression. * \param value The input expression to evaluate. 
diff --git a/python/tvm/script/ir_builder/tir/frame.py b/python/tvm/script/ir_builder/tir/frame.py index 69bc5bfc9676..b9b50dfa9876 100644 --- a/python/tvm/script/ir_builder/tir/frame.py +++ b/python/tvm/script/ir_builder/tir/frame.py @@ -18,7 +18,7 @@ from typing import List, Union from tvm._ffi import register_object as _register_object -from tvm.tir import Var +from tvm.tir import Buffer, Var from ..base import IRBuilderFrame @@ -65,6 +65,52 @@ class RealizeFrame(TIRFrame): ... +@_register_object("script.ir_builder.tir.AllocateFrame") +class AllocateFrame(TIRFrame): + def __enter__(self) -> Buffer: + super().__enter__() + return self.buffer + + +@_register_object("script.ir_builder.tir.AllocateConstFrame") +class AllocateConstFrame(TIRFrame): + def __enter__(self) -> Buffer: + super().__enter__() + return self.buffer + + +@_register_object("script.ir_builder.tir.AttrFrame") +class AttrFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.WhileFrame") +class WhileFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.IfFrame") +class IfFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.ThenFrame") +class ThenFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.ElseFrame") +class ElseFrame(TIRFrame): + ... + + +@_register_object("script.ir_builder.tir.DeclBufferFrame") +class DeclBufferFrame(TIRFrame): + def __enter__(self) -> Buffer: + super().__enter__() + return self.buffer + + @_register_object("script.ir_builder.tir.LaunchThreadFrame") class LaunchThreadFrame(TIRFrame): ... 
diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 6db8f40c32c8..625e1291ff20 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -19,8 +19,10 @@ from numbers import Integral from typing import Any, Dict, List, Optional, Union, Tuple +import numpy as np # type: ignore from tvm.ir import Range, Type +from tvm.runtime import convert, ndarray from tvm.tir import ( Buffer, BufferLoad, @@ -32,6 +34,7 @@ StringImm, Var, ) +from tvm.tir import Ramp as ramp from . import _ffi_api, frame @@ -890,6 +893,217 @@ def realize( ) +def allocate( + extents: List[PrimExpr], + dtype: str, + scope: str = "", + condition: PrimExpr = None, + annotations=None, +) -> frame.AllocateFrame: + """Allocate node. + + Parameters + ---------- + extents : List[PrimExpr] + The extents of the allocate. + + dtype : str + The data type of the buffer. + + scope : str + The storage scope. + + condition : PrimExpr + The condition. + + annotations: Optional[Mapping[str, Object]] + Additional annotation hints. + """ + if isinstance(condition, bool): + condition = IntImm("bool", condition) + return _ffi_api.Allocate( # type: ignore[attr-defined] # pylint: disable=no-member + extents, dtype, scope, condition, annotations + ) + + +def allocate_const( + data: List[PrimExpr], + dtype: str, + extents: List[PrimExpr], + annotations=None, +) -> frame.AllocateConstFrame: + """Allocate constant node. + + Parameters + ---------- + data : List[PrimExpr] + The data associated with the constant. + + dtype : str + The data type of the buffer. + + extents : List[PrimExpr] + The extents of the allocate. + + annotations : Optional[Map] + Additional annotations about the allocation. 
+ """ + + return _ffi_api.AllocateConst( # type: ignore[attr-defined] # pylint: disable=no-member + ndarray.array(np.asarray(data, dtype)), dtype, extents, annotations + ) + + +def attr(node: Any, attr_key: str, value: Union[PrimExpr, str]) -> frame.AttrFrame: + """Create an attribute node. + + Parameters + ---------- + node : Any + The node to annotate the attribute. + + attr_key : str + Attribute type key. + + value : Union[PrimExpr, str] + The value of the attribute. + + Returns + ------- + res : frame.AttrFrame + The result AttrFrame. + """ + node = convert(node) + value = convert(value) + return _ffi_api.Attr(node, attr_key, value) # type: ignore[attr-defined] # pylint: disable=no-member + + +def While(condition: PrimExpr) -> frame.WhileFrame: # pylint: disable=invalid-name + """Create a while node. + + Parameters + ---------- + condition : PrimExpr + The termination condition of the loop. + + Returns + ------- + res : frame.WhileFrame + The result WhileFrame. + """ + if isinstance(condition, bool): + condition = IntImm("bool", condition) + return _ffi_api.While(condition) # type: ignore[attr-defined] # pylint: disable=no-member + + +def If(condition: PrimExpr) -> frame.IfFrame: # pylint: disable=invalid-name + """Create an if node. + + Parameters + ---------- + condition : PrimExpr + The condition of if statement, executes the true branch if the condition is true, + otherwise jump into the false branch. + + Returns + ------- + res : frame.IfFrame + The result IfFrame. + """ + if isinstance(condition, bool): + condition = IntImm("bool", condition) + return _ffi_api.If(condition) # type: ignore[attr-defined] # pylint: disable=no-member + + +def Then() -> frame.ThenFrame: # pylint: disable=invalid-name + """Create a then. + + Returns + ------- + res : frame.ThenFrame + The result ThenFrame. 
+ """ + return _ffi_api.Then() # type: ignore[attr-defined] # pylint: disable=no-member + + +def Else() -> frame.ElseFrame: # pylint: disable=invalid-name + """Create an else. + + Returns + ------- + res : frame.ElseFrame + The result ElseFrame. + """ + return _ffi_api.Else() # type: ignore[attr-defined] # pylint: disable=no-member + + +def decl_buffer( + shape, + dtype="float32", + data=None, + strides=None, + elem_offset=None, + scope="", + align=0, + offset_factor=0, + buffer_type="", + axis_separators=None, +) -> frame.DeclBufferFrame: + """Create a buffer declaration node. + + Parameters + ---------- + shape : Union[List[PrimExpr], Tuple[PrimExpr], PrimExpr, Integral] + The type of the buffer prior to flattening. + + dtype : str + The data type in the content of the buffer. + + data : Var + The pointer to the head of the data. + + strides : List[PrimExpr] + The strides of each dimension. + + elem_offset : PrimExpr + The offset in terms of number of dtype elements (including lanes). + + scope : str + The optional storage scope of buffer data pointer. + + align : int + The alignment requirement of data pointer in bytes. + + offset_factor : int + The factor of elem_offset field. + + buffer_type : str + The buffer type. + + axis_separators : List[int] + The separators between input axes when generating flattened output axes. + + Returns + ------- + res : frame.DeclBufferFrame + The result DeclBufferFrame. 
+ """ + shape = (shape,) if isinstance(shape, (PrimExpr, Integral)) else shape + return _ffi_api.DeclBuffer( # type: ignore[attr-defined] # pylint: disable=no-member + shape, + dtype, + "", + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + ) + + def launch_thread( iter_var: IterVar, # pylint: disable=redefined-outer-name extent: PrimExpr, @@ -939,6 +1153,53 @@ def env_thread(thread_tag: str) -> IterVar: return _ffi_api.EnvThread(thread_tag) # type: ignore[attr-defined] # pylint: disable=no-member +def buffer_store(buffer: Buffer, value: PrimExpr, indices: List[Union[PrimExpr, slice]]) -> None: + """Buffer store node. + + Parameters + ---------- + buffer : Buffer + The buffer. + + value : PrimExpr + The value to be stored. + + indices : List[Union[PrimExpr, slice]] + The indices location to be stored. + """ + from tvm.arith import Analyzer # pylint: disable=import-outside-toplevel + + expr_indices = [] + for index in indices: + if isinstance(index, slice): + step = 1 if index.step is None else index.step + lanes = Analyzer().simplify((index.stop - index.start + step - 1) // step) + if lanes == 1: + expr_indices.append(index.start) + else: + expr_indices.append(ramp(index.start, step, int(lanes))) + else: + expr_indices.append(index) + if isinstance(value, bool) and buffer.dtype == "bool": + value = IntImm("bool", value) + return _ffi_api.BufferStore( # type: ignore[attr-defined] # pylint: disable=no-member + buffer, value, expr_indices + ) + + +def prefetch(buffer: Buffer, indices: List[PrimExpr]) -> None: + """The prefetch hint for a buffer. + + Parameters + ---------- + buffer : Buffer + The buffer to be prefetched. + indices : List[PrimExpr] + The indices of the buffer to extract. + """ + return _ffi_api.Prefetch(buffer, indices) # type: ignore[attr-defined] # pylint: disable=no-member + + def evaluate(value: PrimExpr) -> None: """Evaluate the input expression. 
@@ -1288,8 +1549,18 @@ def var(dtype, name="") -> Var: "Assert", "let", "realize", + "allocate", + "allocate_const", + "attr", + "While", + "If", + "Then", + "Else", + "decl_buffer", "launch_thread", "env_thread", + "buffer_store", + "prefetch", "evaluate", "int8", "int16", diff --git a/src/script/ir_builder/tir/frame.cc b/src/script/ir_builder/tir/frame.cc index 6c9459e6389c..aa9efa653f71 100644 --- a/src/script/ir_builder/tir/frame.cc +++ b/src/script/ir_builder/tir/frame.cc @@ -115,6 +115,76 @@ void LaunchThreadFrameNode::ExitWithScope() { AddToParent(tvm::tir::AttrStmt(iter_var, attr_key, extent, AsStmt(stmts))); } +void AllocateFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::Allocate(buffer->data, buffer->dtype, buffer->shape, condition, + AsStmt(stmts), annotations)); +} + +void AllocateConstFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent( + tvm::tir::AllocateConst(buffer->data, dtype, extents, data, AsStmt(stmts), annotations)); +} +void AttrFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::AttrStmt(node, attr_key, value, AsStmt(stmts))); +} + +void WhileFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::While(condition, AsStmt(stmts))); +} + +void IfFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + if (!stmts.empty()) { + LOG(FATAL) << "stmt within IfThenElse frame should be either in ThenFrame or ElseFrame"; + } + if (!then_stmts.defined()) { + LOG(FATAL) << "IfThenElse frame should have at least one then branch"; + } + AddToParent(tvm::tir::IfThenElse( + condition, AsStmt(then_stmts.value()), + else_stmts.defined() ? 
AsStmt(else_stmts.value()) : tvm::tir::Stmt(nullptr))); +} + +void ThenFrameNode::EnterWithScope() { + IfFrame frame = FindIfFrame("T.then_"); + if (frame->then_stmts.defined()) { + LOG(FATAL) << "ValueError: Duplicate then branch declaration, previous one is " + << frame->then_stmts.value(); + } + TIRFrameNode::EnterWithScope(); +} + +void ThenFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + FindIfFrame("T.then_")->then_stmts = stmts; +} + +void ElseFrameNode::EnterWithScope() { + IfFrame frame = FindIfFrame("T.else_"); + if (!frame->then_stmts.defined()) { + LOG(FATAL) << "The else branch should follow then branch"; + } + if (frame->else_stmts.defined()) { + LOG(FATAL) << "ValueError: Duplicate else branch declaration, previous one is " + << frame->else_stmts.value(); + } + TIRFrameNode::EnterWithScope(); +} + +void ElseFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + FindIfFrame("T.else_")->else_stmts = stmts; +} + +void DeclBufferFrameNode::ExitWithScope() { + TIRFrameNode::ExitWithScope(); + AddToParent(tvm::tir::DeclBuffer(buffer, AsStmt(stmts))); +} + TVM_REGISTER_NODE_TYPE(TIRFrameNode); TVM_REGISTER_NODE_TYPE(PrimFuncFrameNode); TVM_REGISTER_NODE_TYPE(BlockFrameNode); @@ -124,6 +194,14 @@ TVM_REGISTER_NODE_TYPE(AssertFrameNode); TVM_REGISTER_NODE_TYPE(LetFrameNode); TVM_REGISTER_NODE_TYPE(RealizeFrameNode); TVM_REGISTER_NODE_TYPE(LaunchThreadFrameNode); +TVM_REGISTER_NODE_TYPE(AllocateFrameNode); +TVM_REGISTER_NODE_TYPE(AllocateConstFrameNode); +TVM_REGISTER_NODE_TYPE(AttrFrameNode); +TVM_REGISTER_NODE_TYPE(WhileFrameNode); +TVM_REGISTER_NODE_TYPE(IfFrameNode); +TVM_REGISTER_NODE_TYPE(ThenFrameNode); +TVM_REGISTER_NODE_TYPE(ElseFrameNode); +TVM_REGISTER_NODE_TYPE(DeclBufferFrameNode); } // namespace tir } // namespace ir_builder diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 5951af298f62..28c3d69861fa 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ 
-444,6 +444,63 @@ RealizeFrame Realize(tvm::tir::BufferRegion buffer_slice, String storage_scope, return RealizeFrame(n); } +AllocateFrame Allocate(Array extents, DataType dtype, String storage_scope, + Optional condition, Optional> annotations) { + ObjectPtr n = make_object(); + n->extents = extents; + n->dtype = dtype; + n->storage_scope = storage_scope; + n->condition = condition.value_or(tvm::Bool(true)); + n->annotations = annotations.value_or(Map()); + n->buffer = BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, storage_scope, 0, 0, + "default", NullOpt); + return AllocateFrame(n); +} + +AllocateConstFrame AllocateConst(tvm::runtime::NDArray data, DataType dtype, + Array extents, Map annotations) { + ObjectPtr n = make_object(); + n->dtype = dtype; + n->extents = extents; + n->data = data; + n->annotations = annotations; + n->buffer = + BufferDecl(extents, dtype, "", NullOpt, NullOpt, NullOpt, "", 0, 0, "default", NullOpt); + return AllocateConstFrame(n); +} + +AttrFrame Attr(ObjectRef node, String attr_key, PrimExpr value) { + ObjectPtr n = make_object(); + n->node = node; + n->attr_key = attr_key; + n->value = value; + return AttrFrame(n); +} + +WhileFrame While(PrimExpr condition) { + ObjectPtr n = make_object(); + n->condition = condition; + return WhileFrame(n); +} + +IfFrame If(PrimExpr condition) { + ObjectPtr n = make_object(); + n->condition = condition; + n->then_stmts = NullOpt; + n->else_stmts = NullOpt; + return IfFrame(n); +} + +ThenFrame Then() { + ObjectPtr n = make_object(); + return ThenFrame(n); +} + +ElseFrame Else() { + ObjectPtr n = make_object(); + return ElseFrame(n); +} + Var EnvThread(String thread_tag) { IterVar iter_var(Range{nullptr}, Var("", DataType::Int(32)), tvm::tir::IterVarType::kThreadIndex, thread_tag); @@ -456,6 +513,25 @@ Var EnvThread(String thread_tag) { return var; } +void BufferStore(Buffer buffer, PrimExpr value, Array indices) { + AddToParent(tvm::tir::BufferStore(buffer, value, indices)); +} + +void 
Prefetch(Buffer buffer, Array bounds) { + AddToParent(tvm::tir::Prefetch(buffer, bounds)); +} + +DeclBufferFrame DeclBuffer(Array shape, DataType dtype, String buffer_name, + Optional data, Optional> strides, + Optional elem_offset, String storage_scope, int align, + int offset_factor, String buffer_type, + Optional> axis_separators) { + ObjectPtr n = make_object(); + n->buffer = BufferDecl(shape, dtype, buffer_name, data, strides, elem_offset, storage_scope, + align, offset_factor, buffer_type, axis_separators); + return DeclBufferFrame(n); +} + void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } using tvm::script::ir_builder::details::Namer; @@ -540,10 +616,20 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Grid").set_body_typed(Grid); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Assert").set_body_typed(Assert); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Let").set_body_typed(Let); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Allocate").set_body_typed(Allocate); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.AllocateConst").set_body_typed(AllocateConst); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Realize").set_body_typed(Realize); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Attr").set_body_typed(Attr); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.While").set_body_typed(While); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.If").set_body_typed(If); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Then").set_body_typed(Then); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Else").set_body_typed(Else); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.DeclBuffer").set_body_typed(DeclBuffer); TVM_REGISTER_GLOBAL("script.ir_builder.tir.LaunchThread").set_body_typed(LaunchThread); TVM_REGISTER_GLOBAL("script.ir_builder.tir.EnvThread").set_body_typed(EnvThread); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferStore").set_body_typed(BufferStore); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Prefetch").set_body_typed(Prefetch); 
TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8); diff --git a/src/script/ir_builder/tir/utils.h b/src/script/ir_builder/tir/utils.h index c29fae1c65e9..733c975fad7e 100644 --- a/src/script/ir_builder/tir/utils.h +++ b/src/script/ir_builder/tir/utils.h @@ -88,6 +88,21 @@ inline BlockFrame FindBlockFrame(const String& method) { throw; } +/*! + * \brief Check whether the top frame in IRBuilder frame stack is IfFrame. + * \param method The method name to be printed when throwing exception. + * \return The top frame of IfFrame. + */ +inline IfFrame FindIfFrame(const String& method) { + if (Optional frame = IRBuilder::Current()->GetLastFrame()) { + return frame.value(); + } else { + LOG(FATAL) << "ValueError: IfThenElse frame not find. Please ensure '" << method + << "' is called under T.if_()"; + } + throw; +} + /*! * \brief Convert BufferLoad to BufferRegion. * \param buffer_load The BufferLoad. 
diff --git a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 7f2e6e1a4706..40e13a2fbe2f 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -17,9 +17,11 @@ # pylint: disable=invalid-name, missing-docstring """Unittests for tvm.script.ir_builder.tir""" import pytest -import tvm.testing +import numpy as np import tvm +import tvm.testing from tvm import tir +from tvm.runtime import ndarray from tvm.script.ir_builder import tir as T from tvm.script.ir_builder import IRBuilder from tvm.ir.base import assert_structural_equal @@ -29,6 +31,7 @@ def test_ir_builder_tir_primfunc_base(): with IRBuilder() as ib: with T.prim_func(): T.evaluate(0) + # the prim_func generated by IRBuilder prim_func_actual = ib.get() @@ -41,6 +44,7 @@ def test_ir_builder_tir_primfunc_base(): preflattened_buffer_map=None, attrs=None, ) + # Check if the generated ir is expected assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) @@ -58,6 +62,7 @@ def test_ir_builder_tir_primfunc_complete(): buffer_d = T.match_buffer(d, (64, 64), "int64") T.preflattened_buffer(e, (32, 32), "int8", data=e.data) T.evaluate(0) + # the prim_func generated by IRBuilder prim_func_actual = ib.get() @@ -83,6 +88,7 @@ def test_ir_builder_tir_primfunc_complete(): }, attrs=tvm.ir.make_node("DictAttrs", key="value"), ) + # Check if the generated ir is expected assert_structural_equal(prim_func_actual, prim_func_expected, map_free_vars=True) @@ -91,6 +97,7 @@ def test_ir_builder_tir_block_base(): with IRBuilder() as ib: with T.block("block"): T.evaluate(0) + # the block generated by IRBuilder block_realize_actual = ib.get() @@ -110,6 +117,7 @@ def test_ir_builder_tir_block_base(): predicate=True, block=block_expected, ) + # Check if the generated ir is expected assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) @@ 
-131,6 +139,7 @@ def test_ir_builder_tir_block_complete(): T.match_buffer(e[0:32, 0:32], (32, 32), "float32") T.axis.spatial(128, f) T.evaluate(0) + # the block generated by IRBuilder block_realize_actual = ib.get() @@ -158,6 +167,7 @@ def test_ir_builder_tir_block_complete(): predicate=var_a > 1, block=block_expected, ) + # Check if the generated ir is expected assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) @@ -201,6 +211,7 @@ def test_ir_builder_tir_axis(): predicate=True, block=block_expected, ) + # Check if the generated ir is expected assert_structural_equal(block_realize_actual, block_realize_expected, map_free_vars=True) @@ -256,6 +267,7 @@ def test_ir_builder_tir_for(): kind=tir.ForKind.SERIAL, body=parallel_expected, ) + # Check if the generated ir is expected assert_structural_equal(for_actual, for_expected, map_free_vars=True) @@ -271,20 +283,9 @@ def test_ir_builder_tir_assert(): assert_expected = tir.AssertStmt( T.var("int32", name="a") == 0, tir.StringImm("a is 0"), tir.Evaluate(0) ) - # Check if the generated ir is expected - assert_structural_equal(assert_actual, assert_expected, map_free_vars=True) - -def test_ir_builder_tir_evaluate(): - with IRBuilder() as ib: - T.evaluate(0) - # the evaluate generated by IRBuilder - eval_actual = ib.get() - - # the expected evaluate - eval_expected = tir.Evaluate(0) # Check if the generated ir is expected - assert_structural_equal(eval_actual, eval_expected, map_free_vars=True) + assert_structural_equal(assert_actual, assert_expected, map_free_vars=True) def test_ir_builder_tir_let(): @@ -296,6 +297,8 @@ def test_ir_builder_tir_let(): # the expected Let statement let_expected = tir.LetStmt(T.var("int32", name="a"), tir.IntImm("int32", 2), tir.Evaluate(0)) + + # Check if the generated ir is expected assert_structural_equal(let_actual, let_expected, map_free_vars=True) @@ -304,6 +307,8 @@ def test_ir_builder_tir_realize(): with IRBuilder() as ib: with 
T.realize(buffer_a[0:128, 0:128], "test_storage_scope", True): T.evaluate(0) + + # the buffer realization generated by IRBuilder realize_actual = ib.get() # the expected buffer realization @@ -313,6 +318,8 @@ def test_ir_builder_tir_realize(): expected_realize = tir.AttrStmt( buffer_a, "realize_scope", tir.StringImm("test_storage_scope"), buffer_realize ) + + # Check if the generated ir is expected assert_structural_equal(realize_actual, expected_realize, map_free_vars=True) @@ -322,12 +329,152 @@ def test_ir_builder_tir_thread(): brow = T.env_thread("blockIdx.y") with T.launch_thread(brow, 1): T.evaluate(0) + + # the prim_func generated by IRBuilder ir_actual = ib.get() + + # the expected prim_func iter_var = tir.IterVar((0, 1), "v", iter_type=1, thread_tag="blockIdx.y") attr_stmt = tir.AttrStmt(iter_var, "thread_extent", 1, tir.Evaluate(0)) func = tir.PrimFunc([], attr_stmt) + + # Check if the generated ir is expected assert_structural_equal(ir_actual, func, map_free_vars=True) +def test_ir_builder_tir_allocate(): + with IRBuilder() as ib: + with T.allocate([10], "float32", scope="local"): + T.evaluate(1) + + # the allocate generated by IRBuilder + ir_actual = ib.get() + + # the expected allocate + buffer_var = tir.Var("v", tvm.ir.PointerType(tvm.ir.PrimType("float32"), "local")) + ir_expected = tir.Allocate( + buffer_var, "float32", [10], tvm.tir.const(1, "uint1"), tir.Evaluate(1) + ) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_allocate_const(): + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + with IRBuilder() as ib: + with T.allocate_const(data, "int32", [10]): + T.evaluate(1) + + # the allocate const generated by IRBuilder + ir_actual = ib.get() + + # the expected allocate const + buffer_var = tir.Var("v", tvm.ir.PointerType(tvm.ir.PrimType("int32"))) + ir_expected = tir.AllocateConst( + buffer_var, "int32", [10], ndarray.array(np.asarray(data, "int32")), 
tir.Evaluate(1) + ) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_while(): + with IRBuilder() as ib: + with T.While(T.var("int32", "x") > 0): + T.evaluate(0) + + # the while generated by IRBuilder + ir_actual = ib.get() + + # the expected while + ir_expected = tir.While(tir.Var("x", "int32") > 0, tir.Evaluate(0)) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_if_then_else(): + with IRBuilder() as ib: + with T.If(T.var("int32", "c") < 12): + with T.Then(): + T.evaluate(T.int32(0)) + with T.Else(): + T.evaluate(T.int32(1)) + + # the if_then_else generated by IRBuilder + ir_actual = ib.get() + + # the expected if_then_else + ir_expected = tir.IfThenElse( + tir.Var("c", "int32") < 12, + tir.Evaluate(tir.IntImm("int32", 0)), + tir.Evaluate(tir.IntImm("int32", 1)), + ) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_buffer_store(): + buffer_a = T.buffer_decl((10, 10), "float32") + i = T.var("int32", "x") + with IRBuilder() as ib: + T.buffer_store(buffer_a, 0.1, [0, i]) + + # the buffer store generated by IRBuilder + ir_actual = ib.get() + + # the expected buffer store + ir_expected = tir.BufferStore(buffer_a, 0.1, [0, i]) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_prefetch(): + with IRBuilder() as ib: + buffer_a = T.buffer_decl((128, 128), "float32") + T.prefetch(buffer_a, []) + + # the prefetch generated by IRBuilder + ir_actual = ib.get() + + # the expected prefetch + ir_expected = tir.Prefetch(buffer_a, []) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + +def test_ir_builder_tir_evaluate(): + with IRBuilder() as ib: + 
T.evaluate(0) + # the evaluate generated by IRBuilder + eval_actual = ib.get() + + # the expected evaluate + eval_expected = tir.Evaluate(0) + + # Check if the generated ir is expected + assert_structural_equal(eval_actual, eval_expected, map_free_vars=True) + + +def test_ir_builder_tir_decl_buffer(): + with IRBuilder() as ib: + with T.decl_buffer([128, 128], "float32"): + T.evaluate(0) + + # the decl_buffer generated by IRBuilder + ir_actual = ib.get() + + # the expected decl_buffer + buffer = T.buffer_decl((128, 128), "float32") + ir_expected = tir.DeclBuffer(buffer, tir.Evaluate(0)) + + # Check if the generated ir is expected + assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) + + if __name__ == "__main__": tvm.testing.main() From 60cf692a63a22cd2698273c4945f037b4b22474b Mon Sep 17 00:00:00 2001 From: czh978 <41666381+czh978@users.noreply.github.com> Date: Mon, 19 Sep 2022 13:49:04 +0800 Subject: [PATCH 203/704] [Frontend][TFLite] fix detection_postprocess's non_max_suppression_attrs["force_suppress"] (#12593) * [Frontend][TFLite]fix detection_postprocess's non_max_suppression_attrs["force_suppress"] Since tvm only supports operators detection_postprocess use_regular_nms is false, which will suppress boxes that exceed the threshold regardless of the class when implementing NMS in tflite, in order for the results of tvm and tflite to be consistent, we need to set force_suppress to True. 
* [Frontend][TFLite]fix detection_postprocess's non_max_suppression_attrs[force_suppress] Added a test case that reproduces inconsistent results between tvm and tflite When the force_suppress is false,it will get a good result if you set the force_suppress as true --- python/tvm/relay/frontend/tflite.py | 2 +- tests/python/frontend/tflite/test_forward.py | 37 ++++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 6c68230e0ecc..a7e10ad72e55 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -3355,7 +3355,7 @@ def convert_detection_postprocess(self, op): non_max_suppression_attrs = {} non_max_suppression_attrs["return_indices"] = False non_max_suppression_attrs["iou_threshold"] = custom_options["nms_iou_threshold"] - non_max_suppression_attrs["force_suppress"] = False + non_max_suppression_attrs["force_suppress"] = True non_max_suppression_attrs["top_k"] = anchor_boxes non_max_suppression_attrs["max_output_size"] = custom_options["max_detections"] non_max_suppression_attrs["invalid_to_bottom"] = False diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index deaef72e1d7f..7b2bd60d8a20 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -4311,13 +4311,8 @@ def test_forward_matrix_diag(): # ---------------- -def test_detection_postprocess(): - """Detection PostProcess""" - tf_model_file = tf_testing.get_workload_official( - "http://download.tensorflow.org/models/object_detection/" - "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz", - "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03/tflite_graph.pb", - ) +def _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size): + """One iteration of detection postProcess with given model and shapes""" converter = 
tf.lite.TFLiteConverter.from_frozen_graph( tf_model_file, input_arrays=["raw_outputs/box_encodings", "raw_outputs/class_predictions"], @@ -4328,16 +4323,16 @@ def test_detection_postprocess(): "TFLite_Detection_PostProcess:3", ], input_shapes={ - "raw_outputs/box_encodings": (1, 1917, 4), - "raw_outputs/class_predictions": (1, 1917, 91), + "raw_outputs/box_encodings": box_encodings_size, + "raw_outputs/class_predictions": class_predictions_size, }, ) converter.allow_custom_ops = True converter.inference_type = tf.lite.constants.FLOAT tflite_model = converter.convert() np.random.seed(0) - box_encodings = np.random.uniform(size=(1, 1917, 4)).astype("float32") - class_predictions = np.random.uniform(size=(1, 1917, 91)).astype("float32") + box_encodings = np.random.uniform(size=box_encodings_size).astype("float32") + class_predictions = np.random.uniform(size=class_predictions_size).astype("float32") tflite_output = run_tflite_graph(tflite_model, [box_encodings, class_predictions]) tvm_output = run_tvm_graph( tflite_model, @@ -4382,6 +4377,26 @@ def test_detection_postprocess(): ) +def test_detection_postprocess(): + """Detection PostProcess""" + box_encodings_size = (1, 1917, 4) + class_predictions_size = (1, 1917, 91) + tf_model_file = tf_testing.get_workload_official( + "http://download.tensorflow.org/models/object_detection/" + "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03.tar.gz", + "ssd_mobilenet_v2_quantized_300x300_coco_2019_01_03/tflite_graph.pb", + ) + _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size) + + box_encodings_size = (1, 2034, 4) + class_predictions_size = (1, 2034, 91) + tf_model_file = download_testdata( + "https://github.com/czh978/models_for_tvm_test/raw/main/tflite_graph_with_postprocess.pb", + "tflite_graph_with_postprocess.pb", + ) + _test_detection_postprocess(tf_model_file, box_encodings_size, class_predictions_size) + + ####################################################################### # 
Custom Converter # ---------------- From 2af9b90ec191424724842795c552d4c15682eb8c Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 19 Sep 2022 08:20:33 -0500 Subject: [PATCH 204/704] [TIR] Implement API for padded layout transformations (#12720) Implementation of API in `tvm.tir.schedule` for layout transformations with padding, as part of https://github.com/apache/tvm/issues/12261, item "Insert pad value into generated TIR, using `tir::if_then_else`, `builtin::assume`, and `builtin::undef`". Following the RFC discussion in https://github.com/apache/tvm-rfcs/pull/77#issuecomment-1170294348 and https://github.com/apache/tvm-rfcs/pull/77#issuecomment-1171290053, this commit preferentially rewrites the loops that surround a padded transformation where possible, in order to express padding in terms of `tir::if_then_else`. --- include/tvm/tir/schedule/schedule.h | 17 +- python/tvm/tir/function.py | 46 +- python/tvm/tir/schedule/_type_checker.py | 2 +- python/tvm/tir/schedule/schedule.py | 42 +- python/tvm/tir/tensor_intrin/cuda.py | 2 +- src/meta_schedule/postproc/rewrite_layout.cc | 3 +- .../multi_level_tiling_tensor_core.cc | 2 +- src/tir/ir/index_map.cc | 2 +- src/tir/schedule/concrete_schedule.cc | 6 +- src/tir/schedule/concrete_schedule.h | 2 +- src/tir/schedule/instruction_traits.h | 4 +- src/tir/schedule/primitive.h | 4 +- .../primitive/layout_transformation.cc | 910 +++++++++++++++++- src/tir/schedule/schedule.cc | 6 +- src/tir/schedule/traced_schedule.cc | 15 +- src/tir/schedule/traced_schedule.h | 2 +- .../test_tir_schedule_transform_layout.py | 410 ++++++++ 17 files changed, 1408 insertions(+), 67 deletions(-) diff --git a/include/tvm/tir/schedule/schedule.h b/include/tvm/tir/schedule/schedule.h index 8e5cd34d2e0b..049f063240df 100644 --- a/include/tvm/tir/schedule/schedule.h +++ b/include/tvm/tir/schedule/schedule.h @@ -601,9 +601,24 @@ class ScheduleNode : public runtime::Object { * \param buffer_index The index of the buffer in block's read or 
write region. * \param buffer_index_type The type of the buffer index, kRead or kWrite. * \param index_map The transformation to apply. + * + * \param pad_value The value to write into padding introduced by + * the transformation. If the schedule contains a producer block + * for the specified buffer, the pad value will be written as + * part of the producer block if possible, or after the producer + * block otherwise. Otherwise, if the buffer is an input, will + * insert an annotation block to state that the padding contains + * the known value. + * + * Note: If applied to an input buffer, the calling scope is + * responsible for ensuring that the pad_value is present. + * Algebraic symplifications, branch elimination, and other + * optimizations may assume that this precondition is met, and + * may result in incorrect results being returned. */ virtual void TransformLayout(const BlockRV& block_rv, int buffer_index, - BufferIndexType buffer_index_type, const IndexMap& index_map) = 0; + BufferIndexType buffer_index_type, const IndexMap& index_map, + const Optional& pad_value = NullOpt) = 0; /*! * \brief Apply a transformation represented by IndexMap to block diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index e525fc2cc31a..df39f8aebf71 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -308,8 +308,9 @@ def from_func( The function to map from source indices to target indices. The function should accept `tir.Var` parameters and return - a list. Each element of the returned list should be a - `tir.PrimExpr`. + a either a `tir.PrimExpr`, or a list of `tir.PrimExpr`. + Returning a `tir.PrimExpr` is equivalent to returning a + list of length 1 containing that `tir.PrimExpr`. ndim: Optional[int] @@ -356,9 +357,12 @@ def from_func_with_separators( mapping_function : Callable The function to map from source indices to target indices. - The function should accept tir.Var parameters and return a - list. 
Each element of the returned list should be either a - `tir.PrimExpr` or the object `IndexMap.AXIS_SEPARATOR`. + The function should accept tir.Var parameters and return + either a `tir.PrimExpr` or a list. Each element of the + returned list should be either a `tir.PrimExpr` or the + object `IndexMap.AXIS_SEPARATOR`. Returning a + `tir.PrimExpr` is equivalent to returning a list of length + 1 containing that `tir.PrimExpr`. ndim: Optional[int] @@ -423,17 +427,27 @@ def from_func_with_separators( final_indices = [] axis_separators = [] - for val in mapping: - if isinstance(val, tvm.ir.PrimExpr): - final_indices.append(val) - elif val is IndexMap.AXIS_SEPARATOR: - axis_separators.append(len(final_indices)) - else: - raise TypeError( - "Expected mapping function to return list of " - "either tvm.ir.PrimExpr or IndexMap.AXIS_SEPARATOR. " - f"Instead received {val} of type {type(val)}." - ) + + try: + iter(mapping) + is_iterable = True + except TypeError: + is_iterable = False + + if is_iterable: + for val in mapping: + if isinstance(val, tvm.ir.PrimExpr): + final_indices.append(val) + elif val is IndexMap.AXIS_SEPARATOR: + axis_separators.append(len(final_indices)) + else: + raise TypeError( + "Expected mapping function to return list of " + "either tvm.ir.PrimExpr or IndexMap.AXIS_SEPARATOR. " + f"Instead received {val} of type {type(val)}." 
+ ) + else: + final_indices.append(mapping) return IndexMap(initial_indices, final_indices, inverse_index_map), axis_separators diff --git a/python/tvm/tir/schedule/_type_checker.py b/python/tvm/tir/schedule/_type_checker.py index 0b48dfc2b0e6..0c66f7ef6cdf 100644 --- a/python/tvm/tir/schedule/_type_checker.py +++ b/python/tvm/tir/schedule/_type_checker.py @@ -164,7 +164,7 @@ def _dispatcher(type_: Any) -> Tuple[str, List[type]]: return "atomic", [type_] -def callable_str(subtypes): +def callable_str(*subtypes): if subtypes: *arg_types, return_type = subtypes arg_str = ", ".join(_type2str(arg_type) for arg_type in arg_types) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index fdc871703275..b8f696b7a134 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2443,6 +2443,7 @@ def transform_layout( block: Union[BlockRV, str], buffer: Union[Tuple[str, int], str, Buffer], index_map: Union[IndexMap, Callable], + pad_value: Optional[Union[int, float, IndexMap, Callable]] = None, ) -> None: """Apply a transformation represented by IndexMap to buffer @@ -2479,6 +2480,36 @@ def transform_layout( primitive will be called in addition to the TransformLayout primitive. + pad_value: Optional[Union[int, float, PrimExpr, IndexMap, Callable]] + + The value to be used for any padding introduced by the + transformation. If the schedule contains a producer block + for the specified buffer, the pad value will be written as + part of the producer block if possible, or after the producer + block otherwise. Otherwise, if the buffer is an input, will + insert an annotation block to state that the padding contains + the known value. + + The pad value may not contain instances of BufferLoad, + except where it loads a value from the buffer being + transformed (e.g. to create a circular buffer with + padding that consists of repeated elements). 
+ + Note: If applied to an input buffer, the calling scope is + responsible for ensuring that the pad_value is present. + Algebraic symplifications, branch elimination, and other + optimizations may assume that this precondition is met, and + may result in incorrect results being returned. + + If None, the transformation may not introduce padding. + + If an int, float or PrimExpr, the transformation is the + specific value to be present in the padding. + + If an IndexMap or Callable, the transformation is the + value to be present in the padding in terms of the + transformed index. + Examples -------- Before transform_layout, in TensorIR, the IR is: @@ -2536,9 +2567,18 @@ def two_elementwise_transformed_intermediate_buffer(a: T.handle, c: T.handle) -> else: axis_separators = [] + if pad_value is None: + pass + elif callable(pad_value): + pad_value = IndexMap.from_func(pad_value, ndim=len(index_map.final_indices)) + elif not isinstance(pad_value, IndexMap): + pad_value = IndexMap.from_func( + lambda *indices: pad_value, ndim=len(index_map.final_indices) + ) + buffer_index_type_enum = 0 if buffer_index_type == "read" else 1 _ffi_api.ScheduleTransformLayout( # type: ignore # pylint: disable=no-member - self, block, buffer_index, buffer_index_type_enum, index_map + self, block, buffer_index, buffer_index_type_enum, index_map, pad_value ) if axis_separators: _ffi_api.ScheduleSetAxisSeparator( # type: ignore # pylint: disable=no-member diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py index 64d7c24840ae..a309b091285b 100644 --- a/python/tvm/tir/tensor_intrin/cuda.py +++ b/python/tvm/tir/tensor_intrin/cuda.py @@ -36,7 +36,7 @@ def shared_16x32_to_ldmatrix_32x16_layout(i, j): def shared_32x16_to_ldmatrix_32x16_layout(i, j): - thread_id = (i % 4) + 4 * (j % 8) + thread_id = (i % 16) // 4 + 4 * (j % 8) return thread_id, 8 * (j // 8) + (i // 16) * 4 + i % 4 diff --git a/src/meta_schedule/postproc/rewrite_layout.cc 
b/src/meta_schedule/postproc/rewrite_layout.cc index 6ff9958c791f..998b22b57463 100644 --- a/src/meta_schedule/postproc/rewrite_layout.cc +++ b/src/meta_schedule/postproc/rewrite_layout.cc @@ -148,7 +148,8 @@ bool RewriteLayout(const Schedule& sch) { // Apply schedule BlockRV block_rv = sch->GetBlock(block->name_hint, func_name); BlockRV cached_block_rv = sch->CacheRead(block_rv, buffer_index, "global"); - sch->TransformLayout(block_rv, buffer_index, BufferIndexType::kRead, index_map.value()); + sch->TransformLayout(block_rv, buffer_index, BufferIndexType::kRead, index_map.value(), + NullOpt); sch->Annotate(cached_block_rv, attr::meta_schedule_layout_rewrite_preproc, const_true()); } } diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 8fcb8fe503b7..6759b59a3245 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -499,7 +499,7 @@ Optional MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin( const tir::BufferRegion& reindexed_buffer_region = tir::GetNthAccessBufferRegion( state->sch->state(), GetRef(block), buffer_index, index_type); auto sub_index_map = f_get_sub_index_map(lhs_buffer, reindexed_buffer_region->region); - state->sch->TransformLayout(state->block_rv, buffer_index, index_type, sub_index_map); + state->sch->TransformLayout(state->block_rv, buffer_index, index_type, sub_index_map, NullOpt); }; for (int i = 0, n = block_before_reindex->reads.size(); i < n; ++i) { diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index cceff72ec82f..64c5d5d5ddde 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -93,7 +93,7 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia // Unpack the map to an array, maintaining the same parameter order. 
Array inverse_exprs; for (const auto& index : (*this)->initial_indices) { - inverse_exprs.push_back(inverse_exprs_map.at(index)); + inverse_exprs.push_back(analyzer.Simplify(inverse_exprs_map.at(index))); } PrimExpr padding_predicate = padded_iter_map->padding_predicate; diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 9d7dc6b95f50..4558ad04baed 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -761,9 +761,11 @@ void ConcreteScheduleNode::Unannotate(const BlockRV& block_rv, const String& ann /******** Schedule: Layout transformation ********/ void ConcreteScheduleNode::TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, - const IndexMap& index_map) { + const IndexMap& index_map, + const Optional& pad_value) { TVM_TIR_SCHEDULE_BEGIN(); - tir::TransformLayout(state_, this->GetSRef(block_rv), buffer_index, buffer_index_type, index_map); + tir::TransformLayout(state_, this->GetSRef(block_rv), buffer_index, buffer_index_type, index_map, + pad_value); this->state_->DebugVerify(); TVM_TIR_SCHEDULE_END("transform_layout", this->error_render_level_); } diff --git a/src/tir/schedule/concrete_schedule.h b/src/tir/schedule/concrete_schedule.h index 1aa9dafcc93e..59a9e3752859 100644 --- a/src/tir/schedule/concrete_schedule.h +++ b/src/tir/schedule/concrete_schedule.h @@ -144,7 +144,7 @@ class ConcreteScheduleNode : public ScheduleNode { void Unannotate(const BlockRV& block_rv, const String& ann_key) override; /******** Schedule: Layout transformation ********/ void TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, - const IndexMap& index_map) override; + const IndexMap& index_map, const Optional& pad_value) override; void TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) override; void SetAxisSeparator(const BlockRV& block_rv, int buffer_index, BufferIndexType 
buffer_index_type, diff --git a/src/tir/schedule/instruction_traits.h b/src/tir/schedule/instruction_traits.h index 56c69224fe17..122c5ff0d9fe 100644 --- a/src/tir/schedule/instruction_traits.h +++ b/src/tir/schedule/instruction_traits.h @@ -430,7 +430,9 @@ TVM_ALWAYS_INLINE Array UnpackedInstTraits::_ConvertOutputs( /********** PythonAPICall **********/ inline void PythonAPICall::AsPythonString(const ObjectRef& obj, std::ostream& os) { - if (const auto* str = obj.as()) { + if (!obj.defined()) { + os << "None"; + } else if (const auto* str = obj.as()) { os << str->data; } else if (const auto* int_imm = obj.as()) { os << int_imm->value; diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h index 97233fe4bc6f..21388ff132ae 100644 --- a/src/tir/schedule/primitive.h +++ b/src/tir/schedule/primitive.h @@ -474,9 +474,11 @@ TVM_DLL void Unannotate(ScheduleState self, const StmtSRef& sref, const String& * \param buffer_index The index of the buffer in block's read or write region. * \param buffer_index_type The type of the buffer index, kRead or kWrite. * \param index_map The transformation to apply. + * \param pad_value The value to write into padding introduced by the transformation. */ TVM_DLL void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index, - BufferIndexType buffer_index_type, const IndexMap& index_map); + BufferIndexType buffer_index_type, const IndexMap& index_map, + const Optional& pad_value); /*! * \brief Apply a transformation represented by IndexMap to block diff --git a/src/tir/schedule/primitive/layout_transformation.cc b/src/tir/schedule/primitive/layout_transformation.cc index 32ed279f028f..025723e1793d 100644 --- a/src/tir/schedule/primitive/layout_transformation.cc +++ b/src/tir/schedule/primitive/layout_transformation.cc @@ -16,12 +16,647 @@ * specific language governing permissions and limitations * under the License. 
*/ + +#include +#include + #include "../../../arith/ir_mutator_with_analyzer.h" #include "../utils.h" namespace tvm { namespace tir { +/*! \brief Planning stage prior to rewriting in TransformLayoutRewriter + * + * There are four ways that transformation may be handled. Each + * updates the buffer shape and the indices used to access the buffer + * in BufferStore/BufferLoad nodes, but differ in how they handle the + * `pad_value`. In order of preference, the different strategies are + * as follows: + * + * 1. NoPaddingRequired. The transformation does not introduce + * padding, so only local changes to update the indices of + * BufferLoad/BufferStore nodes are required. No blocks are added, + * removed, or replaced. + * + * 2. ProloguePlan. The transformation introduces padding, but the + * analyzed block has no write stages for the transformed buffer. + * This buffer is an input and the caller is responsible for ensuring + * that the padding contains the specified `pad_value`. The generated + * prologue contains `builtin::assume()` calls that will expose this + * known value during scheduling/simplification, but will be removed + * during lowering. + * + * 3. ReplacementPlan. The transformation introduces padding, has at + * least one write stage for the transformed buffer, and at least one + * of those write stages writes to all pre-transformation indices + * following a row-major traversal. These write stages are rewritten to + * be row-major traversals of the post-transformation indices, with a + * `tir::if_then_else` call to write either the specified `pad_value` + * into padding or the computed value into non-padding. + * + * 4. EpiloguePlan. The transformation introduces padding, has at + * least one write stage for the transformed buffer, but no write + * stage can be rewritten to use `tir::if_then_else`. 
The + * transformation still requires the `pad_value` to be written into + * the padding, so a new block is inserted after the last write stage + * to explicitly fill the padding. + * + */ +class TransformLayoutPlanner : private StmtExprVisitor { + public: + // Statement to be inserted prior to the analyzed block + struct ProloguePlan { + Stmt prologue; + }; + + // Loops within the analyzed block that should be replaced + struct ReplacementPlan { + Map replacements; + Map block_sref_reuse; + }; + + // The block to be inserted, along with the location at which it + // should be inserted. The location will be either a For or a + // Block, and will be after all writes to the transformed buffer. + struct EpiloguePlan { + Stmt insert_after; + Stmt new_block; + }; + + struct NoPaddingRequired {}; + + using TransformPlan = + std::variant; + + static TransformPlan Plan(Block block, Buffer old_buffer, Buffer new_buffer, IndexMap index_map, + IndexMap inverse, PrimExpr padding_predicate, + Optional pad_value) { + ICHECK(!pad_value.defined() || pad_value.value()->final_indices.size() == 1) + << "Internal error: Should be caught by ScheduleError checks prior to this point"; + TransformLayoutPlanner visitor(old_buffer); + visitor(block); + return visitor.Finalize(new_buffer, index_map, inverse, padding_predicate, pad_value); + } + + private: + explicit TransformLayoutPlanner(Buffer old_buffer) : old_buffer_(old_buffer) {} + + void VisitStmt_(const ForNode* op) override { + BindLoopVar context(this, GetRef(op)); + StmtExprVisitor::VisitStmt_(op); + } + + void VisitStmt_(const LetStmtNode* op) override { + BindVariableDefinition context(this, op->var, op->value); + StmtExprVisitor::VisitStmt_(op); + } + + void VisitStmt_(const BlockRealizeNode* op) override { + BindBlockRealize context(this, GetRef(op)); + StmtExprVisitor::VisitStmt_(op); + } + + void VisitStmt_(const BufferStoreNode* op) override { + if (!op->buffer.same_as(old_buffer_)) { + return; + } + + std::optional> 
loop_dependency_range = std::nullopt; + for (const auto& index : op->indices) { + if (auto index_depth = LoopDependencyRange(index); index_depth.has_value()) { + if (loop_dependency_range) { + loop_dependency_range = { + std::min(loop_dependency_range.value().first, index_depth.value().first), + std::max(loop_dependency_range.value().second, index_depth.value().second)}; + } else { + loop_dependency_range = index_depth; + } + } + } + + WriteInfo write_info; + write_info.store = GetRef(op); + if (loop_dependency_range) { + size_t i = loop_dependency_range.value().first; + size_t j = loop_dependency_range.value().second; + ICHECK_LT(i, active_loops_.size()); + ICHECK_LT(j, active_loops_.size()); + + write_info.dependent_loopnest = {active_loops_.begin() + i, active_loops_.begin() + j + 1}; + } + write_info.innermost_block_realize = innermost_block_realize_; + + write_info.contains_row_major_traversal = [&]() -> bool { + const auto& loopnest = write_info.dependent_loopnest; + if (loopnest.empty()) { + return false; + } + + if (loopnest.size() != old_buffer_->shape.size() || loopnest.size() != op->indices.size()) { + return false; + } + + for (size_t i = 0; i < loopnest.size(); i++) { + const For& loop = loopnest[i]; + const PrimExpr& buffer_dim = old_buffer_->shape[i]; + PrimExpr index = Substitute(op->indices[i], active_var_bindings_); + bool is_loop_over_axis = index.same_as(loop->loop_var) && is_const_int(loop->min, 0) && + ExprDeepEqual()(loop->extent, buffer_dim) && + loop->kind == ForKind::kSerial; + if (!is_loop_over_axis) { + return false; + } + } + + return true; + }(); + + write_info_.push_back(write_info); + + // Don't need to continue recursing, as the entire goal was to + // find the BufferStore. 
+ } + + std::optional> LoopDependencyRange(const PrimExpr& expr) const { + std::optional> prev = std::nullopt; + for (const auto& var : UndefinedVars(expr)) { + auto it = loop_depth_lookup_.find(var.get()); + if (it != loop_depth_lookup_.end()) { + if (prev.has_value()) { + prev = {std::min(prev.value().first, it->second.first), + std::max(prev.value().second, it->second.second)}; + } else { + prev = it->second; + } + } + } + + return prev; + } + + class BufferStoreReplacer : public StmtExprMutator { + public: + BufferStoreReplacer(std::function(const BufferStoreNode*)> replace_store, + std::function(const BlockRealizeNode*, const BlockRealize&)> + replace_block_realize) + : replace_store_(replace_store), replace_block_realize_(replace_block_realize) {} + + Stmt VisitStmt_(const BufferStoreNode* op) final { + if (auto replacement = replace_store_(op)) { + auto store = Downcast(replacement.value()); + return StmtExprMutator::VisitStmt_(store.get()); + } else { + return StmtExprMutator::VisitStmt_(op); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + auto realize = Downcast(StmtExprMutator::VisitStmt_(op)); + if (auto replacement = replace_block_realize_(op, realize)) { + return replacement.value(); + } else { + return std::move(realize); + } + } + + private: + std::function(const BufferStoreNode*)> replace_store_; + std::function(const BlockRealizeNode*, const BlockRealize&)> + replace_block_realize_; + }; + + TransformPlan Finalize(Buffer new_buffer, IndexMap index_map, IndexMap inverse, + PrimExpr padding_predicate, Optional pad_value) const { + if (auto prologue_plan = + FinalizeProloguePlan(new_buffer, index_map, inverse, padding_predicate, pad_value); + prologue_plan.has_value()) { + return prologue_plan.value(); + } else if (auto replacement_plan = FinalizeReplacementPlan(new_buffer, index_map, inverse, + padding_predicate, pad_value); + replacement_plan.has_value()) { + return replacement_plan.value(); + } else if (auto epilogue_plan = 
FinalizeEpiloguePlan(new_buffer, index_map, inverse, + padding_predicate, pad_value); + epilogue_plan.has_value()) { + return epilogue_plan.value(); + } else { + return NoPaddingRequired(); + } + } + + std::optional FinalizeProloguePlan(Buffer new_buffer, IndexMap index_map, + IndexMap inverse, PrimExpr padding_predicate, + Optional pad_value) const { + if (write_info_.size() || is_zero(padding_predicate) || !pad_value.defined()) { + return std::nullopt; + } + + Array iter_vars; + Array iter_values; + Array indices; + Map loop_indices_to_block_indices; + ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size()); + for (size_t i = 0; i < inverse->initial_indices.size(); i++) { + const auto& loop_var = inverse->initial_indices[i]; + const auto& dim = new_buffer->shape[i]; + Var block_var("v_" + loop_var->name_hint, loop_var->dtype); + IterVar iter_var(Range(0, dim), block_var, kDataPar); + loop_indices_to_block_indices.Set(loop_var, block_var); + indices.push_back(iter_var->var); + iter_vars.push_back(iter_var); + iter_values.push_back(loop_var); + } + padding_predicate = Substitute(std::move(padding_predicate), loop_indices_to_block_indices); + + PrimExpr pad_value_at_index = pad_value.value()->MapIndices(indices)[0]; + PrimExpr expr = (!padding_predicate) || (BufferLoad(new_buffer, indices) == pad_value_at_index); + Stmt stmt = Evaluate(Call(DataType::Bool(), builtin::assume(), {expr})); + + std::stringstream block_name; + block_name << "buffer_" << new_buffer->name << "_assumptions"; + auto read_region = BufferRegion::FromPoint(new_buffer, indices); + stmt = BlockRealize(iter_values, Bool(true), + Block(iter_vars, {read_region}, {}, block_name.str(), stmt)); + + for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) { + size_t i = (inverse->initial_indices.size() - 1) - rev_i; + Var loop_var = inverse->initial_indices[i]; + PrimExpr extent = new_buffer->shape[i]; + stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt); + } + return 
ProloguePlan{stmt}; + } + + std::optional FinalizeReplacementPlan(Buffer new_buffer, IndexMap index_map, + IndexMap inverse, + PrimExpr padding_predicate, + Optional pad_value) const { + if (write_info_.empty() || is_zero(padding_predicate) || !pad_value.defined()) { + return std::nullopt; + } + + auto generate_if_then_else_block = [&](const WriteInfo& info) -> Optional { + if (!info.contains_row_major_traversal || !pad_value.defined() || + is_zero(padding_predicate)) { + return NullOpt; + } + + Array old_indices = info.store->indices; + PrimExpr if_then_else_condition = padding_predicate; + Array new_indices; + for (const auto& var : inverse->initial_indices) { + new_indices.push_back(var); + } + + auto replace_block_realize = + [&]() -> std::function(const BlockRealizeNode*, const BlockRealize&)> { + auto no_change = [](const BlockRealizeNode*, const BlockRealize&) -> Optional { + return NullOpt; + }; + if (!info.innermost_block_realize) { + return no_change; + } + if (old_indices.empty()) { + return no_change; + } + + BlockRealize block_realize = info.innermost_block_realize.value(); + const auto& block = block_realize->block; + + // Find the block iterators that are used to access the buffer. Must be in the same order + // as they appear in the indices. 
+ if (block->iter_vars.size() < old_indices.size()) { + return no_change; + } + const auto& iter_vars = block->iter_vars; + size_t block_index_start = 0; + for (; block_index_start < iter_vars.size() - old_indices.size(); block_index_start++) { + if (old_indices[0].same_as(iter_vars[block_index_start]->var)) { + break; + } + } + if (block_index_start > iter_vars.size() - old_indices.size()) { + return no_change; + } + + for (size_t i = 0; i < old_indices.size(); i++) { + if (!old_indices[i].same_as(iter_vars[block_index_start + i]->var) || + iter_vars[block_index_start + i]->iter_type != kDataPar) { + return no_change; + } + } + + // If we got to this point, all indices used to access the + // buffer are virtual indices defined in the innermost block. + // Therefore, generate new virtual indices for iterating over + // the post-transform buffer. + Array new_iter_values; // For BlockRealize + Array new_iter_vars; // For Block + Array new_access_indices; // For BufferStore + Map loop_var_to_virtual_var; // For updating if_then_else_condition + + for (size_t i = 0; i < block_index_start; i++) { + new_iter_vars.push_back(iter_vars[i]); + new_iter_values.push_back(block_realize->iter_values[i]); + } + + ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size()); + for (size_t i = 0; i < inverse->initial_indices.size(); i++) { + Var var = inverse->initial_indices[i]; + PrimExpr dim = new_buffer->shape[i]; + std::stringstream ss; + ss << "v_" << var->name_hint; + Var virtual_var(ss.str(), var.dtype()); + new_iter_values.push_back(var); + new_iter_vars.push_back(IterVar(Range::FromMinExtent(0, dim), virtual_var, kDataPar)); + new_access_indices.push_back(virtual_var); + loop_var_to_virtual_var.Set(var, virtual_var); + } + + for (size_t i = block_index_start + old_indices.size(); i < iter_vars.size(); i++) { + new_iter_vars.push_back(iter_vars[i]); + new_iter_values.push_back(block_realize->iter_values[i]); + } + + Map old_virtual_var_to_new_virtual_var; + 
ICHECK_EQ(inverse->final_indices.size(), old_indices.size()); + for (size_t i = 0; i < old_indices.size(); i++) { + Var var = Downcast(old_indices[i]); + PrimExpr expr = Substitute(inverse->final_indices[i], loop_var_to_virtual_var); + old_virtual_var_to_new_virtual_var.Set(var, expr); + } + + if_then_else_condition = Substitute(if_then_else_condition, loop_var_to_virtual_var); + new_indices = new_access_indices; + + return [target_realize = info.innermost_block_realize, new_iter_vars, new_iter_values, + old_virtual_var_to_new_virtual_var](const BlockRealizeNode* op, + const BlockRealize& visited) -> Optional { + if (op == target_realize.get()) { + Block block = visited->block; + block = + Downcast(Substitute(std::move(block), old_virtual_var_to_new_virtual_var)); + block.CopyOnWrite()->iter_vars = new_iter_vars; + + BlockRealize realize = visited; + { + auto write_ptr = realize.CopyOnWrite(); + write_ptr->block = block; + write_ptr->iter_values = new_iter_values; + } + return realize; + } else { + return NullOpt; + } + }; + }(); + + bool all_stores_replaced = true; + auto replace_store = [&](const BufferStoreNode* op) -> Optional { + if (!op->buffer.same_as(info.store->buffer)) { + all_stores_replaced = false; + return NullOpt; + } + ICHECK_EQ(old_indices.size(), op->indices.size()); + ExprDeepEqual expr_equal; + for (size_t i = 0; i < old_indices.size(); i++) { + if (!expr_equal(old_indices[i], op->indices[i])) { + all_stores_replaced = false; + return NullOpt; + } + } + + PrimExpr pad_value_at_index = pad_value.value()->MapIndices(new_indices)[0]; + return BufferStore(new_buffer, + if_then_else(if_then_else_condition, pad_value_at_index, op->value), + new_indices); + }; + + BufferStoreReplacer replacer(replace_store, replace_block_realize); + Stmt stmt = replacer(info.dependent_loopnest.back()->body); + if (!all_stores_replaced) { + return NullOpt; + } + + std::unordered_map var_remap; + ICHECK_EQ(info.dependent_loopnest.size(), inverse->final_indices.size()); + 
for (size_t i = 0; i < info.dependent_loopnest.size(); i++) { + Var var = info.dependent_loopnest[i]->loop_var; + PrimExpr expr = inverse->final_indices[i]; + var_remap[var.get()] = expr; + } + stmt = Substitute(std::move(stmt), var_remap); + + ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size()); + for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) { + size_t i = (inverse->initial_indices.size() - 1) - rev_i; + Var loop_var = inverse->initial_indices[i]; + PrimExpr extent = new_buffer->shape[i]; + stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt); + } + + return stmt; + }; + + Map loop_replacements; + + for (const auto& info : write_info_) { + if (info.dependent_loopnest.size()) { + if (auto opt_stmt = generate_if_then_else_block(info)) { + loop_replacements.Set(info.dependent_loopnest[0], opt_stmt.value()); + } + } + } + + if (loop_replacements.size()) { + return ReplacementPlan{std::move(loop_replacements)}; + } else { + return std::nullopt; + } + } + + std::optional FinalizeEpiloguePlan(Buffer new_buffer, IndexMap index_map, + IndexMap inverse, PrimExpr padding_predicate, + Optional pad_value) const { + if (write_info_.empty() || is_zero(padding_predicate) || !pad_value.defined()) { + return std::nullopt; + } + + Array iter_vars; + Array iter_values; + Array indices; + ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size()); + for (size_t i = 0; i < inverse->initial_indices.size(); i++) { + const auto& loop_var = inverse->initial_indices[i]; + const auto& dim = new_buffer->shape[i]; + Var block_var("v_" + loop_var->name_hint, loop_var->dtype); + IterVar iter_var(Range(0, dim), block_var, kDataPar); + indices.push_back(iter_var->var); + iter_vars.push_back(iter_var); + iter_values.push_back(loop_var); + } + + PrimExpr pad_value_at_index = pad_value.value()->MapIndices(indices)[0]; + Stmt stmt = BufferStore(new_buffer, pad_value_at_index, indices); + + std::stringstream block_name; + block_name << "buffer_" 
<< new_buffer->name << "_padding"; + auto write_region = BufferRegion::FromPoint(new_buffer, indices); + stmt = BlockRealize(iter_values, padding_predicate, + Block(iter_vars, {}, {write_region}, block_name.str(), stmt)); + + ICHECK_EQ(inverse->initial_indices.size(), new_buffer->shape.size()); + for (size_t rev_i = 0; rev_i < inverse->initial_indices.size(); rev_i++) { + size_t i = (inverse->initial_indices.size() - 1) - rev_i; + Var loop_var = inverse->initial_indices[i]; + PrimExpr extent = new_buffer->shape[i]; + stmt = For(loop_var, 0, extent, ForKind::kSerial, stmt); + } + + const auto& info = write_info_.back(); + Stmt insert_after = [&]() -> Stmt { + if (info.dependent_loopnest.size()) { + return info.dependent_loopnest.front(); + } else if (info.innermost_block_realize) { + return info.innermost_block_realize.value(); + } else { + LOG(FATAL) << "Write occured outside of any block/loop"; + return Stmt(); + } + }(); + return EpiloguePlan{insert_after, stmt}; + } + + struct BindLoopVar { + BindLoopVar(TransformLayoutPlanner* self, For for_node) + : self_(self), var_(for_node->loop_var) { + size_t loop_depth = self_->active_loops_.size(); + self_->loop_depth_lookup_[var_.get()] = {loop_depth, loop_depth}; + self_->active_loops_.push_back(std::move(for_node)); + } + ~BindLoopVar() { + self_->active_loops_.pop_back(); + self_->loop_depth_lookup_.erase(var_.get()); + } + BindLoopVar(const BindLoopVar&) = delete; + BindLoopVar& operator=(const BindLoopVar&) = delete; + BindLoopVar(BindLoopVar&&) = delete; + BindLoopVar& operator=(BindLoopVar&&) = delete; + + TransformLayoutPlanner* self_{nullptr}; + Var var_; + }; + + struct BindVariableDefinition { + BindVariableDefinition() {} + BindVariableDefinition(TransformLayoutPlanner* self, Var var, PrimExpr value) + : self_(self), var_(var) { + if (auto loop_depth = self->LoopDependencyRange(value); loop_depth.has_value()) { + self_->loop_depth_lookup_[var_.get()] = loop_depth.value(); + 
self_->active_var_bindings_[var_.get()] = Substitute(value, self_->active_var_bindings_); + } + } + ~BindVariableDefinition() { + if (self_) { + self_->loop_depth_lookup_.erase(var_.get()); + self_->active_var_bindings_.erase(var_.get()); + } + } + BindVariableDefinition(const BindVariableDefinition&) = delete; + BindVariableDefinition& operator=(const BindVariableDefinition&) = delete; + BindVariableDefinition(BindVariableDefinition&& other) : BindVariableDefinition() { + swap(other); + } + BindVariableDefinition& operator=(BindVariableDefinition&& other) { + swap(other); + return *this; + } + void swap(BindVariableDefinition& other) { + std::swap(self_, other.self_); + std::swap(var_, other.var_); + } + + TransformLayoutPlanner* self_{nullptr}; + Var var_; + }; + + struct BindBlockRealize { + BindBlockRealize(TransformLayoutPlanner* self, BlockRealize block_realize) : self_(self) { + ICHECK_EQ(block_realize->iter_values.size(), block_realize->block->iter_vars.size()); + for (size_t i = 0; i < block_realize->iter_values.size(); i++) { + bound_vars_.emplace_back(self, block_realize->block->iter_vars[i]->var, + block_realize->iter_values[i]); + } + cache_ = std::move(block_realize); + std::swap(self_->innermost_block_realize_, cache_); + } + ~BindBlockRealize() { std::swap(self_->innermost_block_realize_, cache_); } + BindBlockRealize(const BindBlockRealize&) = delete; + BindBlockRealize& operator=(const BindBlockRealize&) = delete; + BindBlockRealize(BindBlockRealize&&) = delete; + BindBlockRealize& operator=(BindBlockRealize&&) = delete; + + TransformLayoutPlanner* self_{nullptr}; + Optional cache_; + std::vector bound_vars_; + }; + + struct WriteInfo { + // The BufferStore object + BufferStore store; + + // The block realize that contains the store, if any. + Optional innermost_block_realize; + + // The nested loops whose values contribute to the indices used in + // the store. 
Not all loop variables in the loopnest need to + // contribute, but the first and last must. + std::vector dependent_loopnest; + + // Whether the padding could be represented as a tir::if_then_else + // node. This requires that the surrounding loop iterators + // iterate over all pre-transformation buffer axes, and that there are + // no data dependencies between loop iterations. + bool contains_row_major_traversal{false}; + }; + + /*! \brief Collected information about each BufferStore */ + std::vector write_info_; + + /*! \brief The loop iterators surrounding the current node + * + * The outermost loop iterator is `active_loops_.front()`, and the + * innermost loop iterator is `active_loops_.back()`. + * + * Used to fill the `WriteInfo::dependent_loopnest` field. + */ + std::vector active_loops_; + + /*! \brief Lookup for the outer/inner loops + * + * Used to fill the `WriteInfo::dependent_loopnest` field. + */ + std::unordered_map> loop_depth_lookup_; + + /*! \brief The variable mappings that are currently in-scope + * + * Used to determine whether the indices of a BufferStore are a + * row-major traversal, even if they are rebound in let/block + * mappings. + */ + std::unordered_map active_var_bindings_; + + /*! \brief The innermost BlockRealize surrounding the current node + * + * Used to fill the `WriteInfo::innermost_block_realize` field. + */ + Optional innermost_block_realize_{NullOpt}; + + /*! \brief The buffer to be replaced */ + Buffer old_buffer_; +}; + class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { public: /*! 
@@ -33,23 +668,33 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { * \return The new AST rooting at the original parent scope and the map from the old block to the * new block */ - static std::pair> Rewrite(const Stmt& scope_stmt, - const Buffer& old_buffer, - const Buffer& new_buffer, - const IndexMap& index_map) { + static std::pair> Rewrite( + const Block& scope_stmt, const Buffer& old_buffer, const Buffer& new_buffer, + const IndexMap& index_map, const IndexMap& inverse, const PrimExpr& padding_predicate, + const Optional& pad_value) { + auto plan = TransformLayoutPlanner::Plan(scope_stmt, old_buffer, new_buffer, index_map, inverse, + padding_predicate, pad_value); + arith::Analyzer analyzer; - TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, &analyzer); - Stmt result = rewriter(scope_stmt); + TransformLayoutRewriter rewriter(old_buffer, new_buffer, index_map, plan, &analyzer); + Block result = Downcast(rewriter(scope_stmt)); + if (auto plan_ptr = std::get_if(&plan)) { + auto write_ptr = result.CopyOnWrite(); + write_ptr->body = SeqStmt({plan_ptr->prologue, write_ptr->body}); + } return {result, rewriter.block_sref_reuse_}; } private: TransformLayoutRewriter(const Buffer& old_buffer, const Buffer& new_buffer, - const IndexMap& index_map, arith::Analyzer* analyzer) + const IndexMap& index_map, + const TransformLayoutPlanner::TransformPlan& plan, + arith::Analyzer* analyzer) : IRMutatorWithAnalyzer(analyzer), old_buffer_(old_buffer), new_buffer_(new_buffer), index_map_(index_map), + plan_(plan), buffer_data_to_buffer_{{new_buffer->data, new_buffer}} {} void RewriteBufferAccess(Buffer* buffer, Array* indices) { @@ -61,6 +706,31 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { using Parent::VisitExpr_; using Parent::VisitStmt_; + Stmt VisitStmt(const Stmt& stmt) final { + Stmt output = Parent::VisitStmt(stmt); + if (auto plan_ptr = std::get_if(&plan_)) { + if (plan_ptr->insert_after.same_as(stmt)) 
{ + return SeqStmt({output, plan_ptr->new_block}); + } + } + return output; + } + + Stmt VisitStmt_(const ForNode* op) final { + // Some replacements may include the original string, such as + // replacing `loop` with `{loop, post_proc}`. In this case, avoid + // infinite recursion. + + For node = GetRef(op); + if (auto plan_ptr = std::get_if(&plan_)) { + auto it = plan_ptr->replacements.find(node); + if (it != plan_ptr->replacements.end()) { + return VisitStmt((*it).second); + } + } + return Parent::VisitStmt_(op); + } + PrimExpr VisitExpr_(const BufferLoadNode* op) final { BufferLoad buffer_load = Downcast(Parent::VisitExpr_(op)); if (buffer_load->buffer.same_as(old_buffer_)) { @@ -97,6 +767,13 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { auto* n = block.CopyOnWrite(); RewriteAccessRegion(&n->reads, infered_access_regions[0]); RewriteAccessRegion(&n->writes, infered_access_regions[1]); + n->alloc_buffers.MutateByApply([this](const Buffer& buffer) { + if (buffer.same_as(old_buffer_)) { + return new_buffer_; + } else { + return buffer; + } + }); block_sref_reuse_.Set(GetRef(op), block); return std::move(block); } @@ -104,6 +781,7 @@ class TransformLayoutRewriter : private arith::IRMutatorWithAnalyzer { const Buffer& old_buffer_; const Buffer& new_buffer_; const IndexMap& index_map_; + const TransformLayoutPlanner::TransformPlan& plan_; Map buffer_data_to_buffer_; Map block_sref_reuse_; }; @@ -132,8 +810,158 @@ class BufferIsSubregionError : public ScheduleError { Buffer buffer_; }; +class TransformationPaddingIndexMapError : public ScheduleError { + public: + TransformationPaddingIndexMapError(IRModule mod, IndexMap pad_value) + : mod_(mod), pad_value_(pad_value) {} + + String FastErrorString() const final { + std::ostringstream ss; + ss << "ScheduleError: The IndexMap specifying pad_value has " + << pad_value_->final_indices.size() << " outputs, should only have one output"; + return ss.str(); + } + + String DetailRenderTemplate() 
const final { + std::ostringstream ss; + ss << "ScheduleError: Pad value is specified as " << pad_value_ << " which has " + << pad_value_->final_indices.size() << " outputs, but should only have one output"; + return ss.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + private: + IRModule mod_; + IndexMap pad_value_; +}; + +class TransformationPaddingTypeError : public ScheduleError { + public: + TransformationPaddingTypeError(IRModule mod, Buffer buffer, IndexMap pad_value) + : mod_(mod), buffer_(buffer), pad_value_(pad_value) { + ICHECK_EQ(pad_value_->final_indices.size(), 1); + pad_value_dtype_ = pad_value_->final_indices[0].dtype(); + } + + String FastErrorString() const final { + std::ostringstream ss; + ss << "ScheduleError: Type mismatch " << buffer_->dtype << " vs " << pad_value_dtype_; + return ss.str(); + } + + String DetailRenderTemplate() const final { + std::ostringstream ss; + ss << "ScheduleError: Buffer " << buffer_->name << " has elements of type " << buffer_->dtype + << ", but the transformation fills padding with " << pad_value_ << ", which is of type " + << pad_value_dtype_; + return ss.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + private: + IRModule mod_; + Buffer buffer_; + IndexMap pad_value_; + DataType pad_value_dtype_; +}; + +class TransformationPaddingExpressionError : public ScheduleError { + public: + static void Check(IRModule mod, Buffer buffer, IndexMap pad_value) { + Visitor visitor(buffer); + ICHECK_EQ(pad_value->final_indices.size(), 1) + << "Internal error: Should be caught by ScheduleError checks prior to this point"; + visitor(pad_value->final_indices[0]); + if (visitor.illegal_load) { + throw TransformationPaddingExpressionError(mod, buffer, pad_value, + visitor.illegal_load.value()); + } + } + + private: + struct Visitor : ExprVisitor { + explicit Visitor(const Buffer& buffer) : 
buffer_(buffer) {} + + void VisitExpr_(const BufferLoadNode* op) final { + if (!op->buffer.same_as(buffer_)) { + illegal_load = GetRef(op); + } + ExprVisitor::VisitExpr_(op); + } + + const Buffer& buffer_; + Optional illegal_load; + }; + + TransformationPaddingExpressionError(IRModule mod, Buffer buffer, IndexMap pad_value, + BufferLoad illegal_load) + : mod_(mod), buffer_(buffer), pad_value_(pad_value), illegal_load_(illegal_load) {} + + String FastErrorString() const final { + std::ostringstream ss; + ss << "ScheduleError: Pad value may not contain load load from " << illegal_load_->buffer->name; + return ss.str(); + } + + String DetailRenderTemplate() const final { + std::ostringstream ss; + ss << "ScheduleError: Pad value may only contain BufferLoad from the transformed buffer " + << buffer_->name << ", but pad_value " << pad_value_ << " contains expression " + << illegal_load_; + return ss.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + IRModule mod_; + Buffer buffer_; + IndexMap pad_value_; + BufferLoad illegal_load_; +}; + +class TransformationIntroducesPaddingError : public ScheduleError { + public: + TransformationIntroducesPaddingError(IRModule mod, Buffer buffer, IndexMap index_map, + PrimExpr padding_predicate) + : mod_(std::move(mod)), + buffer_(std::move(buffer)), + index_map_(std::move(index_map)), + padding_predicate_(std::move(padding_predicate)) {} + + String FastErrorString() const final { + std::ostringstream ss; + ss << "ScheduleError: Transformation would introduce padding at " << padding_predicate_ << "."; + return ss.str(); + } + + String DetailRenderTemplate() const final { + auto new_shape = index_map_->MapShape(buffer_->shape); + std::ostringstream os; + os << "The transformation " << index_map_ << " applied on buffer " << buffer_->name + << " of shape " << buffer_->shape << " would result in shape " << new_shape + << ". 
However, this would introduce padding wherever " << padding_predicate_ << " is true."; + return os.str(); + } + + IRModule mod() const final { return mod_; } + Array LocationsOfInterest() const final { return {}; } + + private: + IRModule mod_; + Buffer buffer_; + IndexMap index_map_; + PrimExpr padding_predicate_; +}; + void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_index, - BufferIndexType buffer_index_type, const IndexMap& index_map) { + BufferIndexType buffer_index_type, const IndexMap& index_map, + const Optional& pad_value) { + // Step 1: Input handling and error checking const BlockNode* block_ptr = TVM_SREF_TO_BLOCK(block_sref); Buffer old_buffer = GetNthAccessBuffer(self, GetRef(block_ptr), buffer_index, buffer_index_type); @@ -141,33 +969,48 @@ void TransformLayout(ScheduleState self, const StmtSRef& block_sref, int buffer_ if (defining_site_sref.defined() && !is_alloc) { throw BufferIsSubregionError(self->mod, old_buffer); } + if (pad_value) { + if (pad_value.value()->final_indices.size() != 1) { + throw TransformationPaddingIndexMapError(self->mod, pad_value.value()); + } + if (pad_value.value()->final_indices[0]->dtype != old_buffer->dtype) { + throw TransformationPaddingTypeError(self->mod, old_buffer, pad_value.value()); + } + + TransformationPaddingExpressionError::Check(self->mod, old_buffer, pad_value.value()); + } StmtSRef scope_sref = defining_site_sref.defined() ? 
defining_site_sref.value() : GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); - // Step 1: Infer the shape of the new buffer - ObjectPtr new_buffer_node = make_object(*(old_buffer.get())); - new_buffer_node->shape = index_map->MapShape(old_buffer->shape); - Buffer new_buffer{new_buffer_node}; + auto [inverse, padding_predicate] = [&]() { + Array region; + for (const auto& dim : old_buffer->shape) { + region.push_back(Range::FromMinExtent(0, dim)); + } + return index_map.NonSurjectiveInverse(region); + }(); + + bool has_padding = !is_zero(padding_predicate); + if (has_padding && !pad_value.defined()) { + throw TransformationIntroducesPaddingError(self->mod, old_buffer, index_map, padding_predicate); + } - // Step 2: Rewrite access indices and regions of the buffer - auto [new_stmt, block_sref_reuse] = TransformLayoutRewriter::Rewrite( - GetRef(scope_block), old_buffer, new_buffer, index_map); + // Step 2: Infer the shape of the new buffer + Buffer new_buffer = old_buffer; + new_buffer.CopyOnWrite()->shape = index_map->MapShape(old_buffer->shape); + + // Step 3: Rewrite BufferLoad/BufferStore access indices, block read/write regions, and block + // alloc_buffers. + auto [new_stmt, block_sref_reuse] = + TransformLayoutRewriter::Rewrite(GetRef(scope_block), old_buffer, new_buffer, + index_map, inverse, padding_predicate, pad_value); Block new_scope_block = Downcast(new_stmt); - // Step 3: Rewrite alloc_buffer of the block or buffer_map of the PrimFunc. - if (defining_site_sref.defined()) { - auto* n = new_scope_block.CopyOnWrite(); - n->alloc_buffers.MutateByApply([&old_buffer, &new_buffer](const Buffer& buffer) { - if (buffer.same_as(old_buffer)) { - return new_buffer; - } - return buffer; - }); - block_sref_reuse.Set(GetRef(scope_block), new_scope_block); - } else { + // Step 4: Rewrite buffer_map of the PrimFunc if necessary. 
+ if (!defining_site_sref.defined()) { GlobalVar g_var; GetRootPrimFunc(self->mod, scope_block, &g_var); IRModuleNode* new_mod = self->mod.CopyOnWrite(); @@ -502,17 +1345,20 @@ struct TransformLayoutTraits : public UnpackedInstTraits private: static constexpr size_t kNumInputs = 1; - static constexpr size_t kNumAttrs = 3; + static constexpr size_t kNumAttrs = 4; static constexpr size_t kNumDecisions = 0; static void UnpackedApplyToSchedule(Schedule sch, BlockRV block_rv, Integer buffer_index, - Integer buffer_index_type, IndexMap index_map) { + Integer buffer_index_type, IndexMap index_map, + Optional pad_value) { return sch->TransformLayout(block_rv, buffer_index.IntValue(), - static_cast(buffer_index_type->value), index_map); + static_cast(buffer_index_type->value), index_map, + pad_value); } static String UnpackedAsPython(Array outputs, String block_rv, Integer buffer_index, - Integer buffer_index_type, IndexMap index_map) { + Integer buffer_index_type, IndexMap index_map, + Optional pad_value) { PythonAPICall py("transform_layout"); py.Input("block", block_rv); @@ -522,6 +1368,8 @@ struct TransformLayoutTraits : public UnpackedInstTraits py.Input("buffer", os.str()); py.Input("index_map", index_map->ToPythonString()); + py.Input("pad_value", pad_value ? 
pad_value.value()->ToPythonString() : "None"); + return py.Str(); } @@ -532,6 +1380,7 @@ struct TransformLayoutTraits : public UnpackedInstTraits attrs_record.push_back(attrs[0]); attrs_record.push_back(attrs[1]); attrs_record.push_back(String(::tvm::SaveJSON(attrs[2]))); + attrs_record.push_back(attrs[3]); return std::move(attrs_record); } @@ -541,6 +1390,7 @@ struct TransformLayoutTraits : public UnpackedInstTraits attrs.push_back(attrs_record[0]); attrs.push_back(attrs_record[1]); attrs.push_back(::tvm::LoadJSON(Downcast(attrs_record[2]))); + attrs.push_back(attrs_record[3]); return attrs; } diff --git a/src/tir/schedule/schedule.cc b/src/tir/schedule/schedule.cc index d72f67fb7c2d..2f27dbb9fbf1 100644 --- a/src/tir/schedule/schedule.cc +++ b/src/tir/schedule/schedule.cc @@ -248,9 +248,11 @@ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleUnannotate") /******** (FFI) Layout transformation ********/ TVM_REGISTER_GLOBAL("tir.schedule.ScheduleTransformLayout") .set_body_typed([](Schedule self, const BlockRV& block_rv, int buffer_index, - int buffer_index_type, const IndexMap& index_map) { + int buffer_index_type, const IndexMap& index_map, + const Optional& pad_value) { return self->TransformLayout(block_rv, buffer_index, - static_cast(buffer_index_type), index_map); + static_cast(buffer_index_type), index_map, + pad_value); }); TVM_REGISTER_GLOBAL("tir.schedule.ScheduleTransformBlockLayout") .set_body_method(&ScheduleNode::TransformBlockLayout); diff --git a/src/tir/schedule/traced_schedule.cc b/src/tir/schedule/traced_schedule.cc index a31950d33115..9ff793dc39dd 100644 --- a/src/tir/schedule/traced_schedule.cc +++ b/src/tir/schedule/traced_schedule.cc @@ -487,14 +487,17 @@ void TracedScheduleNode::Unannotate(const BlockRV& block_rv, const String& ann_k void TracedScheduleNode::TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, - const IndexMap& index_map) { - ConcreteScheduleNode::TransformLayout(block_rv, buffer_index, 
buffer_index_type, index_map); + const IndexMap& index_map, + const Optional& pad_value) { + ConcreteScheduleNode::TransformLayout(block_rv, buffer_index, buffer_index_type, index_map, + pad_value); static const InstructionKind& kind = InstructionKind::Get("TransformLayout"); trace_->Append( - /*inst=*/Instruction(/*kind=*/kind, - /*inputs=*/{block_rv}, - /*attrs=*/{Integer(buffer_index), Integer(buffer_index_type), index_map}, - /*outputs=*/{})); + /*inst=*/Instruction( + /*kind=*/kind, + /*inputs=*/{block_rv}, + /*attrs=*/{Integer(buffer_index), Integer(buffer_index_type), index_map, pad_value}, + /*outputs=*/{})); } void TracedScheduleNode::TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) { diff --git a/src/tir/schedule/traced_schedule.h b/src/tir/schedule/traced_schedule.h index ad44cc6ae552..0e83b35f44e9 100644 --- a/src/tir/schedule/traced_schedule.h +++ b/src/tir/schedule/traced_schedule.h @@ -103,7 +103,7 @@ class TracedScheduleNode : public ConcreteScheduleNode { void Unannotate(const BlockRV& block_rv, const String& ann_key) override; /******** Schedule: Layout transformation ********/ void TransformLayout(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, - const IndexMap& index_map) override; + const IndexMap& index_map, const Optional& pad_value) override; void TransformBlockLayout(const BlockRV& block_rv, const IndexMap& index_map) override; void SetAxisSeparator(const BlockRV& block_rv, int buffer_index, BufferIndexType buffer_index_type, diff --git a/tests/python/unittest/test_tir_schedule_transform_layout.py b/tests/python/unittest/test_tir_schedule_transform_layout.py index 0332df7fd312..8ed350cc4c46 100644 --- a/tests/python/unittest/test_tir_schedule_transform_layout.py +++ b/tests/python/unittest/test_tir_schedule_transform_layout.py @@ -329,5 +329,415 @@ def test_transform_block_layout_fail_mixed_iter_type(use_block_name): ) +class BasePaddingCompare(tvm.testing.CompareBeforeAfter): + 
pad_value = tvm.testing.parameter(None) + + transformed_buffer = tvm.testing.parameter("A") + + @pytest.fixture + def transform(self, pad_value, transformed_buffer): + def transform(mod): + sch = tir.Schedule(mod) + sch.transform_layout( + "block", transformed_buffer, lambda i: [i // 4, i % 4], pad_value=pad_value + ) + return sch.mod + + return transform + + +class TestNoPadding(BasePaddingCompare): + """Transformations without padding do not depend on pad_value.""" + + pad_value = tvm.testing.parameter(None, 42) + + def before(): + A = T.alloc_buffer(16, "int32") + for i in T.serial(16): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi] = 0 + + def expected(): + A = T.alloc_buffer([4, 4], "int32") + for i in T.serial(16): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi // 4, vi % 4] = 0 + + +class TestNoPaddingMultipleUsage(BasePaddingCompare): + """Transformations without padding do not depend on pad_value. + + Like TestNoPadding, but the buffer A shows up in multiple + locations. To remain internally consistent, all instances of the + buffer should be rewritten. + """ + + pad_value = tvm.testing.parameter(None, 42) + + def before(): + A = T.alloc_buffer(16, "int32") + for i in T.serial(16): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi] = 0 + + B = T.alloc_buffer(16, "int32") + for i in T.serial(16): + with T.block("other"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + def expected(): + A = T.alloc_buffer([4, 4], "int32") + for i in T.serial(16): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi // 4, vi % 4] = 0 + + B = T.alloc_buffer(16, "int32") + for i in T.serial(16): + with T.block("other"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi // 4, vi % 4] + + +class TestNoPaddingOpaqueBlock(BasePaddingCompare): + """Transformations without padding do not depend on pad_value. + + Like TestNoPadding, but buffer access is done in an opaque block. 
+ """ + + pad_value = tvm.testing.parameter(None, 42) + + def before(): + A = T.alloc_buffer(16, "int32") + for i in T.serial(16): + with T.block("block"): + A[i] = 0 + + def expected(): + A = T.alloc_buffer([4, 4], "int32") + for i in T.serial(16): + with T.block("block"): + A[i // 4, i % 4] = 0 + + +class TestErrorIfPaddingForbidden(BasePaddingCompare): + """Unless padding is explicitly enabled, should raise error""" + + def before(): + A = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi] = 0 + + expected = tvm.tir.schedule.schedule.ScheduleError + + +class TestErrorOnWrongPaddingType(BasePaddingCompare): + """The padding must have the same dtype as the buffer""" + + pad_value = tvm.testing.parameter(0.5) + + def before(): + A = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + A[vi] = 0 + + expected = tvm.tir.schedule.schedule.ScheduleError + + +class TestPaddedTransformIfThenElse(BasePaddingCompare): + """Use if_then_else to represent padding, if possible. + + For a block that is a producer of the pre-transformation buffer, + which visits all indices according to a row-major traversal, and + which has no effect other than producing the transformed buffer, + transform the loop iterators to be a row-major traversal of the + post-transformation buffer, with padding represented by + `T.if_then_else`. 
+ """ + + pad_value = tvm.testing.parameter(0) + transformed_buffer = tvm.testing.parameter("B") + + def before(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + def expected(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer([4, 4], "int32") + for i, j in T.grid(4, 4): + with T.block("block"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, A[vi * 4 + vj], dtype="int32") + + +class TestPaddedTransformWithoutLoop(BasePaddingCompare): + """Handle padded writes without a loop + + The statement being replaced may be something other than a + for-loop, such as if a loop has already been unrolled. + """ + + pad_value = tvm.testing.parameter(0) + + def before(A: T.Buffer[14, "int32"]): + with T.block("root"): + T.reads() + T.writes() + with T.block("block"): + A[0] = 0 + + def expected(A: T.Buffer[(4, 4), "int32"]): + with T.block("block"): + A[0, 0] = 0 + + for i, j in T.grid(4, 4): + with T.block("buffer_A_padding"): + vi, vj = T.axis.remap("SS", [i, j]) + T.where(i == 3 and 2 <= j) + A[vi, vj] = 0 + + +class TestPaddedTransformIfThenElseReduction(BasePaddingCompare): + """Like TestPaddedTransformIfThenElse, but with a reduction axis""" + + pad_value = tvm.testing.parameter(0) + transformed_buffer = tvm.testing.parameter("B") + + def before(A: T.Buffer[(14, 32), "int32"]): + B = T.alloc_buffer(14, "int32") + for i, k in T.grid(14, 32): + with T.block("block"): + vi, vk = T.axis.remap("SR", [i, k]) + with T.init(): + B[vi] = 0 + B[vi] = B[vi] + A[vi, vk] + + def expected(A: T.Buffer[(14, 32), "int32"]): + B = T.alloc_buffer([4, 4], "int32") + for i, j, k in T.grid(4, 4, 32): + with T.block("block"): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + B[vi, vj] = T.if_then_else(vi == 3 and 2 <= vj, 0, 0, dtype="int32") + B[vi, vj] = T.if_then_else( + vi == 3 and 2 <= vj, 0, B[vi, vj] + A[vi * 4 + vj, vk], 
dtype="int32" + ) + + +class TestPaddedTransformIfThenElseReductionOpaque(BasePaddingCompare): + """Like TestPaddedTransformIfThenElseReduction, but with opaque blocks""" + + pad_value = tvm.testing.parameter(0) + transformed_buffer = tvm.testing.parameter("B") + + def before(A: T.Buffer[(14, 32), "int32"]): + B = T.alloc_buffer(14, "int32") + for i in T.serial(14): + B[i] = 0 + for k in T.serial(32): + with T.block("block"): + B[i] = B[i] + A[i, k] + + def expected(A: T.Buffer[(14, 32), "int32"]): + B = T.alloc_buffer([4, 4], "int32") + for i, j in T.grid(4, 4): + B[i, j] = T.if_then_else(i == 3 and 2 <= j, 0, 0, dtype="int32") + for k in T.serial(32): + with T.block("block"): + B[i, j] = T.if_then_else( + i == 3 and 2 <= j, 0, B[i, j] + A[i * 4 + j, k], dtype="int32" + ) + + +class TestPaddedTransformPostProcIfRequiredDueToSideEffects(BasePaddingCompare): + """Set the transformation padding in a post-processing block. + + Like TestPaddedTransformIfThenElse, but the block that produces B + also has the effect of setting `C`. 
+ """ + + pad_value = tvm.testing.parameter(0) + transformed_buffer = tvm.testing.parameter("B") + + def before(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer(14, "int32") + C = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + C[vi] = 0 + + def expected(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer([4, 4], "int32") + C = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi // 4, vi % 4] = A[vi] + C[vi] = 0 + + for i, j in T.grid(4, 4): + with T.block("block_pad_B"): + vi, vj = T.axis.remap("SS", [i, j]) + T.where(i == 3 and 2 <= j) + B[vi, vj] = 0 + + +class TestPaddedTransformOfInputCreatesAssumption(BasePaddingCompare): + """Transformation of an input buffer places T.assume locally""" + + pad_value = tvm.testing.parameter(42) + + def before(A: T.Buffer[14, "int32"], B: T.Buffer[14, "int32"]): + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + def expected(A: T.Buffer[(4, 4), "int32"], B: T.Buffer[14, "int32"]): + for i, j in T.grid(4, 4): + with T.block("buffer_A_assumption"): + vi, vj = T.axis.remap("SS", [i, j]) + T.assume(not (vi == 3 and 2 <= vj) or A[vi, vj] == 42) + + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi // 4, vi % 4] + + +class TestPaddedTransformNonConstantValue(tvm.testing.CompareBeforeAfter): + """Allow an expression to specify the pad value. + + Like TestPaddedTransformIfThenElse, but the pad value depends on + the indices. 
+ """ + + @pytest.fixture + def transform(self): + def transform(mod): + sch = tir.Schedule(mod) + sch.transform_layout( + "block", + "B", + lambda i: [i // 4, i % 4], + pad_value=lambda i, j: i + j, + ) + return sch.mod + + return transform + + def before(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + def expected(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer([4, 4], "int32") + for i, j in T.grid(4, 4): + with T.block("block"): + vi, vj = T.axis.remap("SS", [i, j]) + B[vi, vj] = T.if_then_else( + vi == 3 and 2 <= vj, vi + vj, A[vi * 4 + vj], dtype="int32" + ) + + +@pytest.mark.xfail(reason="Not yet implemented") +class TestPaddedTransformRepeatedBufferElement(tvm.testing.CompareBeforeAfter): + """Allow an expression to specify the pad value. + + Like TestPaddedTransformOfInputCreatesAssumption, but the pad + value depends on another portion of the buffer. In this case, the + padding at the end of A contains repeated elements from the + beginning of A. 
+ """ + + @pytest.fixture + def transform(self): + def transform(mod): + sch = tir.Schedule(mod) + + A = sch.get(sch.get_block("block")).reads[0].buffer + sch.transform_layout( + "block", + "A", + lambda i: [i // 4, i % 4], + pad_value=lambda i, j: A[(4 * i + j) % 14], + ) + return sch.mod + + return transform + + def before(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + def expected(A: T.Buffer[(4, 4), "int32"]): + for i, j in T.grid(4, 4): + with T.block("buffer_A_assumption"): + vi, vj = T.axis.remap("SS", [i, j]) + T.assume( + not (vi == 3 and 2 <= vj) + or A[vi, vj] == A[((4 * vi + j) % 14) // 4, ((4 * vi + j) % 14) % 4] + ) + + B = T.alloc_buffer(14, "int32") + for i in T.grid(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi // 4, vi % 4] + + +class TestPadValueMayNotReferenceOtherBuffer(tvm.testing.CompareBeforeAfter): + """Allow an expression to specify the pad value. + + Like TestPaddedTransformRepeatedBufferElement, but the pad value depends on + a different buffer, which is not allowed. 
+ """ + + @pytest.fixture + def transform(self): + def transform(mod): + sch = tir.Schedule(mod) + + A = sch.get(sch.get_block("block")).reads[0].buffer + other = tir.decl_buffer(1, A.dtype, name="other") + sch.transform_layout( + "block", + "A", + lambda i: [i // 4, i % 4], + pad_value=lambda i, j: other[0], + ) + return sch.mod + + return transform + + def before(A: T.Buffer[14, "int32"]): + B = T.alloc_buffer(14, "int32") + for i in T.serial(14): + with T.block("block"): + vi = T.axis.remap("S", [i]) + B[vi] = A[vi] + + expected = tvm.tir.schedule.schedule.ScheduleError + + if __name__ == "__main__": tvm.testing.main() From e30ac71bdecea3625c150a49591c886e60a48479 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 19 Sep 2022 11:50:20 -0500 Subject: [PATCH 205/704] [Arith][TIR] IntSetAnalyzer, delay intersection of IntSet until use (#12821) Follow-up from https://github.com/apache/tvm/pull/11970, to improve performance. In the initial implementation, the `analyzer->int_set` would compute the intersection of all scope-based constraints when entering the scope, even if they weren't actually used. This commit delays the call to `Intersect` until required, following the same behavior as `ConstIntBound`. --- src/arith/int_set.cc | 126 ++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index 35b12bb35238..7d601d9a8bae 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -362,8 +362,13 @@ using namespace tir; // We might use better set analysis in the future to replace the intervalset. 
class IntervalSetEvaluator : public ExprFunctor { public: - IntervalSetEvaluator(Analyzer* analyzer, const Map& dom_map, bool eval_vec = false) - : analyzer_(analyzer), dom_map_(dom_map), eval_vec_(eval_vec) {} + IntervalSetEvaluator(Analyzer* analyzer, const Map& dom_map, + const std::vector>* dom_constraints = nullptr, + bool eval_vec = false) + : analyzer_(analyzer), + dom_map_(dom_map), + dom_constraints_(dom_constraints), + eval_vec_(eval_vec) {} IntervalSet Eval(const PrimExpr& val) { return this->VisitExpr(val); } // evaluate and relax the set @@ -383,18 +388,40 @@ class IntervalSetEvaluator : public ExprFunctor { IntervalSet VisitExpr_(const VarNode* op) final { Var var = GetRef(op); + + Array values; + if (dom_constraints_) { + for (const auto& constraint : *dom_constraints_) { + if (var.same_as(constraint.first)) { + values.push_back(constraint.second); + } + } + } + auto it = dom_map_.find(var); if (it != dom_map_.end()) { - IntervalSet res = ToIntervalSet((*it).second); - if (res->min_value.same_as(var) && res->max_value.same_as(var)) { - return res; - } - // recursively evaluate mapped result - // in case the domain contains variables to be relaxed. - return Eval(res); - } else { + values.push_back((*it).second); + } + + if (values.empty()) { return IntervalSet::SinglePoint(var); } + + IntSet intersection = [&]() { + if (values.size() == 1) { + return values.front(); + } else { + return Intersect(values); + } + }(); + + IntervalSet res = ToIntervalSet(intersection); + if (res->min_value.same_as(var) && res->max_value.same_as(var)) { + return res; + } + // recursively evaluate mapped result + // in case the domain contains variables to be relaxed. 
+ return Eval(res); } IntervalSet VisitExpr_(const AddNode* op) final { return VisitBinaryExpr_(op); } @@ -517,6 +544,7 @@ class IntervalSetEvaluator : public ExprFunctor { // analyzer Analyzer* analyzer_; const Map& dom_map_; + const std::vector>* dom_constraints_; bool eval_vec_{false}; }; @@ -529,7 +557,7 @@ class IntSetAnalyzer::Impl { } IntSet Eval(const PrimExpr& expr) const { - return IntervalSetEvaluator(analyzer_, GetCurrentBounds(), true).Eval(expr); + return IntervalSetEvaluator(analyzer_, dom_map_, &dom_constraints_, true).Eval(expr); } void Bind(const Var& var, const Range& range, bool allow_override) { @@ -541,10 +569,6 @@ class IntSetAnalyzer::Impl { std::function EnterConstraint(const PrimExpr& constraint); private: - // Get the current variable bounds, including both global bounds and - // scope-dependent bounds. - Map GetCurrentBounds() const; - // Utility function to split a boolean condition into the domain // bounds implied by that condition. static std::vector> DetectBoundInfo(const PrimExpr& cond); @@ -556,9 +580,11 @@ class IntSetAnalyzer::Impl { // ranges) Map dom_map_; - // Map of variables to implicit scope-dependent bounds (e.g. inside - // the body of an if-statement) - Map constraints_; + // List of implicit scope-dependent bounds (e.g. inside the body of + // an if-statement). Maintained as a list of constraints, rather + // than as a `Map`, to avoid computing an Intersection + // until required. + std::vector> dom_constraints_; }; IntSetAnalyzer::IntSetAnalyzer(Analyzer* parent) : impl_(new Impl(parent)) {} @@ -603,29 +629,6 @@ void IntSetAnalyzer::Impl::Bind(const Var& var, const PrimExpr& expr, bool can_o Update(var, Eval(expr), can_override); } -Map IntSetAnalyzer::Impl::GetCurrentBounds() const { - // If either constraints_ or dom_map_ is empty, return the other to - // avoid constructing a new map. 
- if (constraints_.empty()) { - return dom_map_; - } else if (dom_map_.empty()) { - return constraints_; - } - - // If neither is empty, construct a merged domain map with - // information from both sources. - Map merged = dom_map_; - for (const auto& pair : constraints_) { - auto it = merged.find(pair.first); - if (it == merged.end()) { - merged.Set(pair.first, pair.second); - } else { - merged.Set(pair.first, Intersect({pair.second, (*it).second})); - } - } - return merged; -} - std::vector> IntSetAnalyzer::Impl::DetectBoundInfo( const PrimExpr& constraint) { PVar x; @@ -665,41 +668,16 @@ std::function IntSetAnalyzer::EnterConstraint(const PrimExpr& constraint } std::function IntSetAnalyzer::Impl::EnterConstraint(const PrimExpr& constraint) { - Map cached_values; - auto bounds = DetectBoundInfo(constraint); if (bounds.size() == 0) return nullptr; - // Collect the current values of each var that is changes by this - // constraint. - for (const auto& pair : bounds) { - auto it = constraints_.find(pair.first); - if (it == constraints_.end()) { - cached_values.Set(pair.first, IntSet()); - } else { - cached_values.Set(pair.first, (*it).second); - } - } - - // Update all constraints - for (const auto& pair : bounds) { - auto it = constraints_.find(pair.first); - if (it == constraints_.end()) { - constraints_.Set(pair.first, pair.second); - } else { - constraints_.Set(pair.first, Intersect({pair.second, (*it).second})); - } - } - - auto frecover = [cached_values, this]() { - for (const auto& it : cached_values) { - if (it.second.defined()) { - constraints_.Set(it.first, it.second); - } else { - constraints_.erase(it.first); - } - } + size_t old_size = dom_constraints_.size(); + dom_constraints_.insert(dom_constraints_.end(), bounds.begin(), bounds.end()); + size_t new_size = dom_constraints_.size(); + auto frecover = [old_size, new_size, this]() { + ICHECK_EQ(dom_constraints_.size(), new_size); + dom_constraints_.resize(old_size); }; return frecover; } @@ -960,13 
+938,13 @@ Map ConvertDomMap(const std::unordered_map& IntSet EvalSet(PrimExpr e, const Map& dom_map) { Analyzer ana; - return IntervalSetEvaluator(&ana, dom_map, false).Eval(e); + return IntervalSetEvaluator(&ana, dom_map, {}, false).Eval(e); } IntSet IntSet::Vector(PrimExpr x) { Analyzer ana; Map dmap; - return IntervalSetEvaluator(&ana, dmap, true).Eval(x); + return IntervalSetEvaluator(&ana, dmap, {}, true).Eval(x); } IntSet EvalSet(PrimExpr e, const Map& dom_map) { From da7f65d9d152397f8f7e73b21c6310f976e64bfd Mon Sep 17 00:00:00 2001 From: Noah Verke Date: Mon, 19 Sep 2022 10:01:29 -0700 Subject: [PATCH 206/704] [Hexagon] Create test examples to show parallelization (#12654) * [Hexagon] Create test examples to show parallelization working on Hexagon workloads. * Increase max size of tvm_rpc_android buffer size. * Reformat tests to be parameterized. * Comment out tests to speedup CI. --- .../contrib/test_hexagon/test_parallel_hvx.py | 230 ++++++++++++++++++ .../test_hexagon/test_parallel_scalar.py | 159 ++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/test_parallel_hvx.py create mode 100644 tests/python/contrib/test_hexagon/test_parallel_scalar.py diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx.py b/tests/python/contrib/test_hexagon/test_parallel_hvx.py new file mode 100644 index 000000000000..a34f5b8e261b --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_parallel_hvx.py @@ -0,0 +1,230 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Test parallelizing HVX workloads and compare them to single thread examples. +""" +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" + + +def get_vrmpy_shape_dtypes(operations): + return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 32), "int32") + + +def get_vmpy_vadd_shape_dtype(operations): + return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 128), "int16") + + +def vmpy_expected_producer(shape, a, b): + expected = np.zeros(shape, dtype="int16") + for n in range(shape[0]): + for i in range(0, 128, 2): + expected[n, i // 2] = np.int16(a[n, i]) * np.int16(b[n, i]) + for i in range(1, 128, 2): + expected[n, i // 2 + 64] = np.int16(a[n, i]) * np.int16(b[n, i]) + return expected + + +def vadd_expected_producer(shape, a, b): + expected = np.zeros(shape, dtype="int16") + for n in range(shape[0]): + for i in range(0, 128, 2): + expected[n, i // 2] = np.int16(a[n, i]) + np.int16(b[n, i]) + for i in range(1, 128, 2): + expected[n, i // 2 + 64] = np.int16(a[n, i]) + np.int16(b[n, i]) + return expected + + +def vrmpy_expected_producer(shape, a, b): + expected = np.zeros(shape, dtype="int32") + for n in range(shape[0]): + for i in range(32): + for r in range(4): + expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r]) * np.uint32( + b[n, i * 4 + r] + ) + return expected + + +def get_vmpy_operator(operations): + @T.prim_func + def 
operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8") + B = T.match_buffer(b, [operations, 128], dtype="uint8") + C = T.match_buffer(c, [operations, 128], dtype="int16") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int16x128", + ) + + return operator + + +def get_vadd_operator(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8") + B = T.match_buffer(b, [operations, 128], dtype="uint8") + C = T.match_buffer(c, [operations, 128], dtype="int16") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 128)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int16x128", + ) + + return operator + + +def get_vrmpy_operator(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8") + B = T.match_buffer(b, [operations, 128], dtype="uint8") + C = T.match_buffer(c, [operations, 32], dtype="int32") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 
128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): + a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes + + target_hexagon = tvm.target.hexagon("v68") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + rng = default_rng() + a = rng.integers(0, 16, a_shape, dtype=a_dtype) + b = rng.integers(0, 16, b_shape, dtype=b_dtype) + c = np.zeros(c_shape, dtype=c_dtype) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. + number = 1 + repeat = 1 + + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) + runtime = timer(a_hexagon, b_hexagon, c_hexagon) + tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected_output_producer(c_shape, a, b)) + + return round(runtime.mean * 1000, 6) + + +class TestMatMulVec: + + ( + operation_name, + operator_producer, + shape_dtypes_producer, + expected_output_producer, + ) = tvm.testing.parameters( + ("vrmpy", get_vrmpy_operator, get_vrmpy_shape_dtypes, vrmpy_expected_producer), + ("vmpy", get_vmpy_operator, get_vmpy_vadd_shape_dtype, vmpy_expected_producer), + ("vadd", get_vadd_operator, get_vmpy_vadd_shape_dtype, vadd_expected_producer), + ) + + # Experimentally best split factor but all multiples of 4 perform pretty well. + # This is because there are 4 HVX untis available on the device and pipelining + # works best with parallels of the number of available HVX. 
+ split_factor = tvm.testing.parameter(4) + + # Removed most of these to speedup CI. + operation_count = tvm.testing.parameter( + 128, + # 256, + # 512, + # 1024, # Single thread runs faster since L2 cache can handle the entire request quickly + # 2048, + # 4096, # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # 8192, + # 16384, + ) + + @tvm.testing.requires_hexagon + def test( + self, + hexagon_session, + operation_count, + operation_name, + operator_producer, + shape_dtypes_producer, + expected_output_producer, + split_factor, + ): + + sch = tvm.tir.Schedule(operator_producer(operation_count)) + single_thread_runtime = evaluate( + hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch + ) + + sch = tvm.tir.Schedule(operator_producer(operation_count)) + block = sch.get_block("C") + b = sch.get_loops(block) + bo, _ = sch.split(b[0], factors=[split_factor, None]) + sch.parallel(bo) + + parallel_runtime = evaluate( + hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch + ) + + speedup = round(single_thread_runtime / parallel_runtime, 2) + + print( + TEST_OUTPUT_TEMPLATE.format( + operation_name, operation_count, single_thread_runtime, parallel_runtime, speedup + ) + ) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_parallel_scalar.py b/tests/python/contrib/test_hexagon/test_parallel_scalar.py new file mode 100644 index 000000000000..b3d07ae978ba --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_parallel_scalar.py @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test parallelism for multiple different scalar workloads. """ + +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +TEST_OUTPUT_TEMPLATE = "Test {} with {} operations... \n -Single Thread: {} ms \n -Parallel: {} ms\n -Speedup: {}x\n" + + +def get_add_operator(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations], dtype="float64") + B = T.match_buffer(b, [operations], dtype="float64") + C = T.match_buffer(c, [operations], dtype="float64") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn] = A[vn] + B[vn] + + return operator + + +def get_multiply_operator(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations], dtype="float64") + B = T.match_buffer(b, [operations], dtype="float64") + C = T.match_buffer(c, [operations], dtype="float64") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn] = A[vn] * B[vn] + + return operator + + +def get_sub_operator(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) 
+ A = T.match_buffer(a, [operations], dtype="float64") + B = T.match_buffer(b, [operations], dtype="float64") + C = T.match_buffer(c, [operations], dtype="float64") + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn] = A[vn] - B[vn] + + return operator + + +def evaluate(hexagon_session, operations, expected, sch): + shape = operations + dtype = "float64" + + target_hexagon = tvm.target.hexagon("v68") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + rng = default_rng() + a = rng.random(shape, dtype=dtype) + b = rng.random(shape, dtype=dtype) + c = np.zeros(shape, dtype=dtype) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device) + + # These are reduced for CI but number=100 and repeat=10 does a good job of removing noise. + number = 1 + repeat = 1 + + timer = module.time_evaluator( + "__tvm_main__", hexagon_session.device, number=number, repeat=repeat + ) + runtime = timer(a_hexagon, b_hexagon, c_hexagon) + + tvm.testing.assert_allclose(c_hexagon.asnumpy(), expected(a, b)) + + return round(runtime.mean * 1000, 6) + + +class TestMatMulVec: + + (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters( + ("add", get_add_operator, (lambda a, b: a + b)), + ("mul", get_multiply_operator, (lambda a, b: a * b)), + ("sub", get_sub_operator, (lambda a, b: a - b)), + ) + + # Removed most of these to speedup CI. 
+ operations = tvm.testing.parameter( + 128, + # 256, + # 512, + # 1024, # Single thread runs faster since L2 cache can handle the entire request quickly + # 2048, + # 4096, # Significant performance degredation once the inputs and outputs cannot all fit in L2 + # 8192, + # 16384, + ) + + split_factor = tvm.testing.parameter(4) + + @tvm.testing.requires_hexagon + def test_add( + self, + hexagon_session, + operation_name, + operator_producer, + expected_output_producer, + operations, + split_factor, + ): + + sch = tvm.tir.Schedule(operator_producer(operations)) + single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch) + + sch = tvm.tir.Schedule(operator_producer(operations)) + block = sch.get_block("C") + b = sch.get_loops(block) + bo, _ = sch.split(b[0], factors=[split_factor, None]) + sch.parallel(bo) + parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch) + + speedup = round(single_thread_runtime / parallel_runtime, 2) + print( + TEST_OUTPUT_TEMPLATE.format( + operation_name, operations, single_thread_runtime, parallel_runtime, speedup + ) + ) + + +if __name__ == "__main__": + tvm.testing.main() From f19046caba1a87fb21a832dff74cd80699703576 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Mon, 19 Sep 2022 10:04:16 -0700 Subject: [PATCH 207/704] [MetaSchedule] Support padding for irregular shapes for CUDA tensor core (#12759) * [MetaSchedule] Support padding for irregular shapes for CUDA tensor core * fix * Fix test --- python/tvm/tir/schedule/analysis.py | 7 +- python/tvm/tir/schedule/transform.py | 8 +- .../multi_level_tiling_tensor_core.cc | 3 +- src/tir/schedule/analysis.h | 8 +- src/tir/schedule/analysis/analysis.cc | 53 +++++-- src/tir/schedule/transform.cc | 10 +- src/tir/schedule/transform.h | 3 +- ...test_meta_schedule_schedule_rule_mlt_tc.py | 149 +++++++++++++++++- .../unittest/test_tir_schedule_analysis.py | 29 +++- 9 files changed, 242 insertions(+), 28 deletions(-) diff --git 
a/python/tvm/tir/schedule/analysis.py b/python/tvm/tir/schedule/analysis.py index cdb4aa9cfa20..90c585ac8ce1 100644 --- a/python/tvm/tir/schedule/analysis.py +++ b/python/tvm/tir/schedule/analysis.py @@ -68,7 +68,7 @@ class TensorizeInfo(Object): def get_tensorize_loop_mapping( - sch: Schedule, block: BlockRV, desc_func: PrimFunc + sch: Schedule, block: BlockRV, desc_func: PrimFunc, allow_padding: bool = False ) -> Optional[TensorizeInfo]: """Establish a mapping between loops in a target block and an intrinsic description @@ -80,13 +80,14 @@ def get_tensorize_loop_mapping( The target block to match against desc_func : PrimFunc The prim func describing the computation to be tensorized - + allow_padding : bool + Whether to allow padding the block iters to match the intrinsic description Returns ------- tensorize_info : Optional[TensorizeInfo] TensorizeInfo structure if a valid mapping is found, None otherwise """ - return _ffi_api.GetTensorizeLoopMapping(sch, block, desc_func) # type: ignore + return _ffi_api.GetTensorizeLoopMapping(sch, block, desc_func, allow_padding) # type: ignore @tvm._ffi.register_object("tir.schedule.AutoTensorizeMappingInfo") diff --git a/python/tvm/tir/schedule/transform.py b/python/tvm/tir/schedule/transform.py index 5dbc06846d52..e40b55d4d6b2 100644 --- a/python/tvm/tir/schedule/transform.py +++ b/python/tvm/tir/schedule/transform.py @@ -21,7 +21,9 @@ from . import _ffi_api -def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) -> Optional[LoopRV]: +def tile_with_tensor_intrin( + sch: Schedule, block: BlockRV, intrin_name: str, allow_padding: bool = False +) -> Optional[LoopRV]: """Tile a subset of loops in the block according to the given tensor intrinsic. Parameters @@ -32,6 +34,8 @@ def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) -> The block whose subset of loops will be tiled intrin_name : str The name of a tensor intrinsic, must be registerd via TensorIntrin.register(...) 
beforehand + allow_padding : bool + Whether to allow padding when tiling Returns ------- @@ -39,4 +43,4 @@ def tile_with_tensor_intrin(sch: Schedule, block: BlockRV, intrin_name: str) -> LoopRV corresponding to the outermost loop of a block tiled according to the given intrin NullOpt if no valid loop mapping is found """ - return _ffi_api.TileWithTensorIntrin(sch, block, intrin_name) # type: ignore + return _ffi_api.TileWithTensorIntrin(sch, block, intrin_name, allow_padding) # type: ignore diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 6759b59a3245..290a85b2579b 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -515,7 +515,8 @@ Optional MultiLevelTilingTensorCoreNode::TransformWithTensorIntrin( state->sch->TransformBlockLayout(state->tensor_core_reindex_B, index_map); state->sch->TransformBlockLayout(state->block_rv, index_map); - return tir::TileWithTensorIntrin(state->sch, state->block_rv, intrin_name); + return tir::TileWithTensorIntrin(state->sch, state->block_rv, intrin_name, + /*allow_padding=*/true); } inline std::vector MultiLevelTilingTensorCoreNode::TransformForTensorization( diff --git a/src/tir/schedule/analysis.h b/src/tir/schedule/analysis.h index ca45bcac6b34..57165fd08ad4 100644 --- a/src/tir/schedule/analysis.h +++ b/src/tir/schedule/analysis.h @@ -731,10 +731,15 @@ class TensorizeInfoNode : public Object { Map loop_map; /*! \brief Maps loops in an intrinsic description to its index, outer to inner */ Map desc_loop_indexer; + /*! 
\brief Optional padded extents of the block iters when padding is needed to match the + * intrinsic description + */ + Optional> block_iter_paddings; void VisitAttrs(AttrVisitor* v) { v->Visit("loop_map", &loop_map); v->Visit("desc_loop_indexer", &desc_loop_indexer); + v->Visit("block_iter_paddings", &block_iter_paddings); } static constexpr const char* _type_key = "tir.schedule.TensorizeInfo"; @@ -751,11 +756,12 @@ class TensorizeInfo : public ObjectRef { * \param self The schedule state to be tensorized * \param block_sref The target block to match against * \param desc_func The prim func describing the computation to be tensorized + * \param allow_padding Whether to allow padding the block iters to match the intrinsic description * \return TensorizeInfo structure if a valid mapping is found, NullOpt otherwise */ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, const tir::StmtSRef& block_sref, - const tir::PrimFunc& desc_func); + const tir::PrimFunc& desc_func, bool allow_padding); /*!\brief Necessary information used to perform transformations for tensorization */ class AutoTensorizeMappingInfoNode : public Object { diff --git a/src/tir/schedule/analysis/analysis.cc b/src/tir/schedule/analysis/analysis.cc index e39f7b25543c..294826a1f6b9 100644 --- a/src/tir/schedule/analysis/analysis.cc +++ b/src/tir/schedule/analysis/analysis.cc @@ -1697,7 +1697,8 @@ TensorIntrinDescInfo ExtractTensorIntrinDescInfo(arith::Analyzer* analyzer, Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, const tir::StmtSRef& block_sref, - const tir::PrimFunc& desc_func) { + const tir::PrimFunc& desc_func, + bool allow_padding) { arith::Analyzer analyzer; const tir::BlockRealize& block = tir::GetBlockRealize(self, block_sref); // Step 1. 
Analyze desc_func, extract its block, loops and loop vars @@ -1730,6 +1731,8 @@ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, const int n_desc_vars = desc_block->iter_values.size(); const int offset = n_block_vars - n_desc_vars; + std::unordered_map block_index_to_padding; // padding of each block iter if necessary + if (offset < 0) { return NullOpt; } @@ -1780,10 +1783,11 @@ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, // Step 3.2. Find the corresponding iter_value of the target block with a matching iterator type PrimExpr block_bind; - for (int i = next_block_ind; i >= 0; --i) { - if (iter_types_block[i] == iter_type_desc) { - next_block_ind = i - 1; - block_bind = block->iter_values[i]; + int current_block_ind = next_block_ind; + for (; current_block_ind >= 0; --current_block_ind) { + if (iter_types_block[current_block_ind] == iter_type_desc) { + next_block_ind = current_block_ind - 1; + block_bind = block->iter_values[current_block_ind]; break; } } @@ -1800,15 +1804,30 @@ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, PrimExpr residual = analyzer.Simplify(block_bind - block_loops[i]->loop_var); if (UsesVar(residual, - [&block_loop_vars](const VarNode* var) { return block_loop_vars.count(var); })) + [&block_loop_vars](const VarNode* var) { return block_loop_vars.count(var); })) { continue; + } + // padding is allowed only when the block has trivial bindings + if (allow_padding && !is_zero(residual)) { + allow_padding = false; + } const IntImmNode* int_block_extent = block_loops[i]->extent.as(); // Check divisibility - if (!int_block_extent || int_block_extent->value % int_desc_extent->value != 0) { + if (!int_block_extent) { return NullOpt; } + int64_t remainder = int_block_extent->value % int_desc_extent->value; + if (remainder != 0) { + if (allow_padding) { + // If the block loop is not divisible by the desc loop, we pad the block loop to make it + // divisible if padding is allowed. 
+ block_index_to_padding[current_block_ind] = int_desc_extent->value - remainder; + } else { + return NullOpt; + } + } ret->loop_map.Set(block_loop_sref, GetRef(desc_loop)); break; @@ -1818,13 +1837,29 @@ Optional GetTensorizeLoopMapping(const tir::ScheduleState& self, for (int i = 0, n = desc_loops.size(); i < n; ++i) { ret->desc_loop_indexer.Set(GetRef(desc_loops[i]), Integer(i)); } + if (!block_index_to_padding.empty()) { + if (!allow_padding) { + return NullOpt; + } + Array paddings; + for (int i = 0, n = block->block->iter_vars.size(); i < n; ++i) { + const IterVar& iter_var = block->block->iter_vars[i]; + if (auto it = block_index_to_padding.find(i); it != block_index_to_padding.end()) { + paddings.push_back(IntImm(iter_var->var.dtype(), it->second)); + } else { + paddings.push_back(IntImm(iter_var->var.dtype(), 0)); + } + } + ret->block_iter_paddings = std::move(paddings); + } + return TensorizeInfo(ret); } TVM_REGISTER_GLOBAL("tir.schedule.IsSpatialPrimFunc").set_body_typed(IsSpatialPrimFunc); TVM_REGISTER_GLOBAL("tir.schedule.GetTensorizeLoopMapping") - .set_body_typed([](Schedule sch, BlockRV block, PrimFunc desc_func) { - return GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block), desc_func); + .set_body_typed([](Schedule sch, BlockRV block, PrimFunc desc_func, bool allow_padding) { + return GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block), desc_func, allow_padding); }); /******** Auto Tensorization ********/ diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index dfbd3dbcbcc4..b00005c58061 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -288,11 +288,15 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_ } Optional TileWithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv, - const String& intrin_name) { - Optional opt_tensorize_info = GetTensorizeLoopMapping( - sch->state(), sch->GetSRef(block_rv), 
tir::TensorIntrin::Get(intrin_name)->desc); + const String& intrin_name, bool allow_padding) { + Optional opt_tensorize_info = + GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block_rv), + tir::TensorIntrin::Get(intrin_name)->desc, allow_padding); if (!opt_tensorize_info) return NullOpt; const tir::TensorizeInfoNode* info = opt_tensorize_info.value().get(); + if (info->block_iter_paddings.defined()) { + sch->PadEinsum(block_rv, info->block_iter_paddings.value()); + } // Construct a mapping from tir loops back to LoopRVs Map loop2rv; { diff --git a/src/tir/schedule/transform.h b/src/tir/schedule/transform.h index 4de3685e2482..2bba13e2bd1c 100644 --- a/src/tir/schedule/transform.h +++ b/src/tir/schedule/transform.h @@ -193,11 +193,12 @@ void LeafBlockRemovalPlan(const ScheduleState& self, const StmtSRef& leaf_block_ * \param block_rv The block whose subset of loops will be tiled * \param intrin_name The name of a tensor intrinsic, must be registerd via * TensorIntrin.register(...) beforehand + * \param allow_padding Whether to allow padding when tiling * \return LoopRV corresponding to the outermost loop of a * block tiled according to the given intrin, NullOpt if a valid loop mapping is not found */ Optional TileWithTensorIntrin(const tir::Schedule& sch, const tir::BlockRV& block_rv, - const String& intrin_name); + const String& intrin_name, bool allow_padding = false); /******** Block mutation ********/ diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py index fbb74090b1e5..f7a5ce997edf 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_tc.py @@ -16,6 +16,7 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring import tvm +import tvm.testing from tvm import meta_schedule as ms from tvm import te from tvm.meta_schedule.testing import te_workload @@ -947,11 +948,145 @@ def test_matmul_relu_non_tensorizable(): tvm.ir.assert_structural_equal(mod, sch.mod["main"]) +def test_padded_matmul_relu(): + # fmt: off + @T.prim_func + def padded_matmul_relu_0(A: T.Buffer[(127, 127), "float16"], B: T.Buffer[(127, 127), "float16"], compute: T.Buffer[(127, 127), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + C_reindex_shared = T.alloc_buffer([128, 128], dtype="float32", scope="shared") + C_reindex_shared_wmma_accumulator = T.alloc_buffer([128, 128], dtype="float32", scope="wmma.accumulator") + A_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + B_reindex_shared = T.alloc_buffer([128, 128], dtype="float16", scope="shared") + A_reindex_shared_wmma_matrix_a = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_a") + B_reindex_shared_wmma_matrix_b = T.alloc_buffer([128, 128], dtype="float16", scope="wmma.matrix_b") + for ax0_0_0_ax1_0_0_fused in T.thread_binding(8, thread="blockIdx.y"): + for ax0_0_1_ax1_0_1_fused in T.thread_binding(2, thread="blockIdx.x"): + for ax0_0_2_ax1_0_2_fused in T.thread_binding(2, thread="threadIdx.y"): + for ax2_0_0 in T.serial(1): + for ax0_ax1_fused in T.serial(4096): + with T.block("A_reindex_shared"): + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0_ax1_fused // 128) + v1 = T.axis.spatial(128, ax0_ax1_fused % 128) + T.reads(A[v0, v1]) + T.writes(A_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":8}) + A_reindex_shared[v0, v1] = T.if_then_else(v0 < 127 and v1 < 127, A[v0, v1], T.float16(0), dtype="float16") + for ax0_ax1_fused in T.serial(4096): + with 
T.block("B_reindex_shared"): + v0 = T.axis.spatial(128, ax0_ax1_fused // 32) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax0_ax1_fused % 32) + T.reads(B[v0, v1]) + T.writes(B_reindex_shared[v0, v1]) + T.block_attr({"buffer_dim_align":[[0, 0, 32, 8]], "meta_schedule.cooperative_fetch":1}) + B_reindex_shared[v0, v1] = T.if_then_else(v0 < 127 and v1 < 127, B[v0, v1], T.float16(0), dtype="float16") + for ax2_0_1 in T.serial(4): + for ax0_0, ax1_0 in T.grid(2, 2): + with T.block("A_reindex_shared_wmma.matrix_a_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax2_0_1 * 2 + ax1_0) + T.reads(A_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_a"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("A_reindex_shared_wmma.matrix_a"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = A_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0, ax1_0 in T.grid(2, 1): + with T.block("B_reindex_shared_wmma.matrix_b_o"): + v0_o = T.axis.spatial(8, ax2_0_1 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + T.reads(B_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_load_16x16x16_f16_b"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("B_reindex_shared_wmma.matrix_b"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + 
T.reads(B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + B_reindex_shared_wmma_matrix_b[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = B_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0_0_3, ax1_0_3, ax2_0_2, ax0_0_4, ax1_0_4 in T.grid(1, 1, 2, 2, 1): + with T.block("C_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0_3 * 2 + ax0_0_4) + v1_o = T.axis.spatial(8, ax1_0_4 + ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused + ax1_0_3) + v2_o = T.axis.reduce(8, ax2_0_0 * 8 + ax2_0_1 * 2 + ax2_0_2) + T.reads(A_reindex_shared_wmma_matrix_a[v0_o * 16 : v0_o * 16 + 16, v2_o * 16 : v2_o * 16 + 16], B_reindex_shared_wmma_matrix_b[v2_o * 16 : v2_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_sync_16x16x16_f16f16f32", "meta_schedule.auto_tensorize_init":"wmma_fill_16x16x16_f32", "warp_execution":1}) + with T.init(): + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_init"): + v0_i_init, v1_i_init = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads() + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init]) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i_init, v1_o * 16 + v1_i_init] = T.float32(0) + for ax0_1, ax1_1, ax2_1 in T.grid(16, 16, 16): + with T.block("C"): + v0_i, v1_i, v2_i = T.axis.remap("SSR", [ax0_1, ax1_1, ax2_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i], A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.block_attr({"meta_schedule.tiling_structure":"SSSRRSRS"}) + C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] 
= C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + T.cast(A_reindex_shared_wmma_matrix_a[v0_o * 16 + v0_i, v2_o * 16 + v2_i], "float32") * T.cast(B_reindex_shared_wmma_matrix_b[v2_o * 16 + v2_i, v1_o * 16 + v1_i], "float32") + for ax0_0, ax1_0 in T.grid(2, 1): + with T.block("C_reindex_shared_wmma.accumulator_o"): + v0_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused // 2 * 2 + ax0_0) + v1_o = T.axis.spatial(8, ax0_0_0_ax1_0_0_fused % 2 * 4 + ax0_0_1_ax1_0_1_fused * 2 + ax0_0_2_ax1_0_2_fused) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.writes(C_reindex_shared[v0_o * 16 : v0_o * 16 + 16, v1_o * 16 : v1_o * 16 + 16]) + T.block_attr({"meta_schedule.auto_tensorize":"wmma_store_16x16x16_f32_shared"}) + for ax0_1, ax1_1 in T.grid(16, 16): + with T.block("C_reindex_shared_wmma.accumulator"): + v0_i, v1_i = T.axis.remap("SS", [ax0_1, ax1_1]) + T.reads(C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + T.writes(C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i]) + C_reindex_shared[v0_o * 16 + v0_i, v1_o * 16 + v1_i] = C_reindex_shared_wmma_accumulator[v0_o * 16 + v0_i, v1_o * 16 + v1_i] + for ax0, ax1 in T.grid(32, 32): + with T.block("C_reindex_shared"): + T.where(ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0 < 127 and ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1 < 127) + v0 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused // 2 * 32 + ax0) + v1 = T.axis.spatial(128, ax0_0_0_ax1_0_0_fused % 2 * 64 + ax0_0_1_ax1_0_1_fused * 32 + ax1) + T.reads(C_reindex_shared[v0, v1]) + T.writes(compute[v0, v1]) + T.block_attr({"meta_schedule.cooperative_fetch":4}) + compute[v0, v1] = T.max(C_reindex_shared[v0, v1], T.float32(0)) + # fmt: on + + decision_0 = [ + ("SamplePerfectTile", [4, 1, 1, 1, 2]), + ("SamplePerfectTile", [2, 2, 2, 1, 1]), + ("SamplePerfectTile", [1, 4, 2]), + ("SampleCategorical", 3), + ("SampleCategorical", 3), + ("SampleCategorical", 0), + ] + + mod = 
te.create_prim_func( + te_workload.matmul_relu( + n=127, + m=127, + k=127, + in_dtype="float16", + out_dtype="float32", + ) + ) + actual = ms.TuneContext( + mod=mod, + target=tvm.target.Target("cuda"), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[multi_level_tiling_tensor_core(write_reuse_scope="shared")] + + get_rules("cuda", ms.schedule_rule.AutoInline), + ).generate_design_space() + check_sketches( + mod, + sketches=actual, + expected_mods=[padded_matmul_relu_0], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": - test_matmul_relu() - test_matmul_relu_with_fallback() - test_conv2d() - test_conv2d_more_intrin() - test_matmul_relu_pipeline() - test_matmul_relu_global() - test_matmul_relu_non_tensorizable() + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index 378e5183b49c..807420ece3ba 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -21,7 +21,10 @@ import tvm.testing from tvm.tir.function import TensorIntrin from tvm.tir.tensor_intrin.x86 import dot_product_16x4_u8i8i32_desc -from tvm.tir.tensor_intrin.cuda import WMMA_SYNC_16x16x16_f16f16f32_INTRIN +from tvm.tir.tensor_intrin.cuda import ( + WMMA_SYNC_16x16x16_f16f16f16_INTRIN, + WMMA_SYNC_16x16x16_f16f16f32_INTRIN, +) from tvm.tir import Evaluate, For, ForKind, IndexMap, Var, decl_buffer, floordiv, floormod, Schedule @@ -301,6 +304,30 @@ def matmul_16x16x16xf16f16f16_desc( assert s.get(desc_loop_to_sref[desc_loops[2]]) == s.get(i2) +def test_get_tensorize_loop_mapping_padding_matmul(): + matmul = create_prim_func( + te_workload.matmul_relu( + n=127, + m=256, + k=65, + in_dtype="float16", + out_dtype="float16", + ) + ) + s = Schedule(matmul) + block = s.get_block("C") + + desc = TensorIntrin.get(WMMA_SYNC_16x16x16_f16f16f16_INTRIN).desc + info = get_tensorize_loop_mapping(s, block, desc, 
allow_padding=True) + assert info is not None + expected_padding = [1, 0, 15] + actual_padding = info.block_iter_paddings + assert actual_padding is not None + assert len(actual_padding) == len(expected_padding) + for actual, expected in zip(actual_padding, expected_padding): + assert actual == expected + + def check_index_map(workload, block_name, intrin_name, expected_index_map): s = Schedule(workload) block = s.get_block(block_name) From 79c48f38878788b46d3acd1945469ae97e508d7d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 19 Sep 2022 13:02:53 -0500 Subject: [PATCH 208/704] [TIR][Bugfix] Correct handling of buffer argument when scheduling (#12816) Follow-up from https://github.com/apache/tvm/pull/11269, which allowed schedule arguments of the buffer to be transformed to be specified as a string, or as a `tir::Buffer`. The string handling worked correctly, but the `tir::Buffer` object was handled incorrectly. This commit corrects the handling of `tir::Buffer` arguments when scheduling, and adds a unit test to validate this behavior. 
--- python/tvm/tir/schedule/schedule.py | 6 +-- .../test_tir_schedule_set_axis_separator.py | 41 +++++++++++++------ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/tvm/tir/schedule/schedule.py b/python/tvm/tir/schedule/schedule.py index b8f696b7a134..27171aca411b 100644 --- a/python/tvm/tir/schedule/schedule.py +++ b/python/tvm/tir/schedule/schedule.py @@ -2390,7 +2390,7 @@ def iter_buffers(): if isinstance(buffer, str): possible_buffers = {} # String lookup requires ensuring that the name is unique - for buffer_index, buffer_index_type, buf in iter_buffers(): + for buffer_index_type, buffer_index, buf in iter_buffers(): if buf.name == buffer: possible_buffers[buf] = (buffer_index_type, buffer_index) @@ -2398,12 +2398,12 @@ def iter_buffers(): assert ( len(possible_buffers) == 1 ), f"Multiple buffers named '{buffer}' in block '{block_name}'" - buffer_obj, (buffer_index, buffer_index_type) = next(iter(possible_buffers.items())) + buffer_obj, (buffer_index_type, buffer_index) = next(iter(possible_buffers.items())) elif isinstance(buffer, Buffer): # Buffer lookup has unique id, can break out early found = False - for buffer_index, buffer_index_type, buffer_obj in iter_buffers(): + for buffer_index_type, buffer_index, buffer_obj in iter_buffers(): if buffer_obj.same_as(buffer): found = True break diff --git a/tests/python/unittest/test_tir_schedule_set_axis_separator.py b/tests/python/unittest/test_tir_schedule_set_axis_separator.py index b432fbb61066..327df33408f2 100644 --- a/tests/python/unittest/test_tir_schedule_set_axis_separator.py +++ b/tests/python/unittest/test_tir_schedule_set_axis_separator.py @@ -102,18 +102,25 @@ def element_wise_subregion_match_set_axis_separator(A: T.Buffer[(128, 128), "flo # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg -use_sugared_transform = tvm.testing.parameter( - by_dict={"set_axis_separators": False, "transform_layout_sugared": True} -) +argument_style = 
tvm.testing.parameter('set_axis_separators', + 'transform_layout_named', + 'transform_layout_buffer_object', + ) -def test_set_axis_separator(use_sugared_transform): + +def test_set_axis_separator(argument_style): func = element_wise s = tir.Schedule(func, debug_mask='all') - if use_sugared_transform: + if argument_style=='set_axis_separators': s.set_axis_separator(s.get_block("B"), ("write",0), [1]) - else: + elif argument_style=='transform_layout_named': s.transform_layout(block='B', buffer='B', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) + elif argument_style =='transform_layout_buffer_object': + B = s.get(s.get_block('B')).writes[0].buffer + s.transform_layout(block='B', buffer=B, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) + else: + raise ValueError(f'Unexpected argument_style: {argument_style}') tvm.ir.assert_structural_equal(element_wise_set_axis_separator, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) @@ -128,28 +135,38 @@ def test_set_scope_fail_on_index_out_of_bound(): s.set_axis_separator(s.get_block("B"), ("read",-1),[1]) -def test_set_axis_separator_input_buffer(use_sugared_transform): +def test_set_axis_separator_input_buffer(argument_style): func = element_wise s = tir.Schedule(func, debug_mask='all') - if use_sugared_transform: + if argument_style=='set_axis_separators': + s.set_axis_separator(s.get_block("B"), ("read",0), [1]) + elif argument_style=='transform_layout_named': s.transform_layout(block='B', buffer='A', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) + elif argument_style =='transform_layout_buffer_object': + A = s.get(s.get_block('B')).reads[0].buffer + s.transform_layout(block='B', buffer=A, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) else: - s.set_axis_separator(s.get_block("B"), ("read",0), [1]) + raise ValueError(f'Unexpected argument_style: {argument_style}') tvm.ir.assert_structural_equal(element_wise_set_axis_separator_input_buffer, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) 
-def test_set_axis_separator_subregion(use_sugared_transform): +def test_set_axis_separator_subregion(argument_style): func = element_wise_subregion_match s = tir.Schedule(func, debug_mask='all') - if use_sugared_transform: + if argument_style=='set_axis_separators': + s.set_axis_separator(s.get_block("B"), ("write",0), [1]) + elif argument_style=='transform_layout_named': s.transform_layout(block='B', buffer='B', index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) + elif argument_style =='transform_layout_buffer_object': + B = s.get(s.get_block('B')).writes[0].buffer + s.transform_layout(block='B', buffer=B, index_map=lambda i,j: [i,IndexMap.AXIS_SEPARATOR,j]) else: - s.set_axis_separator(s.get_block("B"), ("write",0), [1]) + raise ValueError(f'Unexpected argument_style: {argument_style}') tvm.ir.assert_structural_equal(element_wise_subregion_match_set_axis_separator, s.mod["main"]) verify_trace_roundtrip(sch=s, mod=func) From f9b692765adf19a2bd3e5cf7abab8c1c74714f81 Mon Sep 17 00:00:00 2001 From: yanghaku <36074633+yanghaku@users.noreply.github.com> Date: Tue, 20 Sep 2022 06:32:58 +0800 Subject: [PATCH 209/704] [BugFix][LLVM] Fix the bug that the generated systemlib cannot register ```__tvm_module_ctx``` symbol sometimes (#12817) [BugFix][LLVM] Fix the bug that the generated systemlib cannot register '__tvm_module_ctx' symbol sometimes. 
--- src/target/llvm/codegen_cpu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 57ee62e152db..eb5c92e663fa 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -178,9 +178,9 @@ void CodeGenCPU::Init(const std::string& module_name, LLVMTarget* llvm_target, b llvm::Function::Create(ftype_tvm_parallel_barrier_, llvm::Function::ExternalLinkage, "TVMBackendParallelBarrier", module_.get()); } - InitGlobalContext(dynamic_lookup); target_c_runtime_ = target_c_runtime; is_system_lib_ = system_lib; + InitGlobalContext(dynamic_lookup); } void CodeGenCPU::AddFunction(const PrimFunc& f) { From a75dcabd3f5306ed1c792c0877becab219004ed8 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Mon, 19 Sep 2022 17:20:53 -0700 Subject: [PATCH 210/704] [MetaSchedule] PyDatabase Complete Function Reload Support (#12838) * Save for PR. * Fix database default query function call. * Add test. * Fix lint. * Remove unused import. * Differentiate override class. * Reuse outer class functions. * Fix lint. --- include/tvm/meta_schedule/database.h | 70 ++++++ python/tvm/meta_schedule/database/database.py | 81 +++++++ src/meta_schedule/database/database.cc | 6 + .../unittest/test_meta_schedule_database.py | 211 +++++++++++++++++- 4 files changed, 365 insertions(+), 3 deletions(-) diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index fa488a38ce0a..4092fdae36dd 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace tvm { @@ -267,6 +268,33 @@ class PyDatabaseNode : public DatabaseNode { * \return An Array of all the tuning records in the database. */ using FGetAllTuningRecords = runtime::TypedPackedFunc()>; + /*! + * \brief The function type of `QueryTuningRecord` method. + * \param mod The IRModule to be searched for. 
+ * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. + * \return The best record of the given workload; NullOpt if not found. + */ + using FQueryTuningRecord = runtime::TypedPackedFunc( + const IRModule&, const Target&, const String&)>; + /*! + * \brief The function type of `QuerySchedule` method. + * \param mod The IRModule to be searched for. + * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. + * \return The schedule in the best schedule of the given workload; NullOpt if not found. + */ + using FQuerySchedule = runtime::TypedPackedFunc( + const IRModule&, const Target&, const String&)>; + /*! + * \brief The function type of `QueryIRModule` method. + * \param mod The IRModule to be searched for. + * \param target The target to be searched for. + * \param workload_name The name of the workload to be searched for. + * \return The IRModule in the best IRModule of the given workload; NullOpt if not found. + */ + using FQueryIRModule = + runtime::TypedPackedFunc(const IRModule&, const Target&, const String&)>; /*! * \brief The function type of `Size` method. * \return The size of the database. @@ -283,6 +311,12 @@ class PyDatabaseNode : public DatabaseNode { FGetTopK f_get_top_k; /*! \brief The packed function to the `GetAllTuningRecords` function. */ FGetAllTuningRecords f_get_all_tuning_records; + /*! \brief The packed function to the `QueryTuningRecord` function. */ + FQueryTuningRecord f_query_tuning_record; + /*! \brief The packed function to the `QuerySchedule` function. */ + FQuerySchedule f_query_schedule; + /*! \brief The packed function to the `QueryIRModule` function. */ + FQueryIRModule f_query_ir_module; /*! \brief The packed function to the `Size` function. 
*/ FSize f_size; @@ -295,6 +329,9 @@ class PyDatabaseNode : public DatabaseNode { // `f_commit_tuning_record` is not visited // `f_get_top_k` is not visited // `f_get_all_tuning_records` is not visited + // `f_query_tuning_record` is not visited + // `f_query_schedule` is not visited + // `f_query_ir_module` is not visited // `f_size` is not visited } @@ -325,6 +362,33 @@ class PyDatabaseNode : public DatabaseNode { return f_get_all_tuning_records(); } + Optional QueryTuningRecord(const IRModule& mod, const Target& target, + const String& workload_name) final { + if (f_query_tuning_record == nullptr) { + return DatabaseNode::QueryTuningRecord(mod, target, workload_name); + } else { + return f_query_tuning_record(mod, target, workload_name); + } + } + + Optional QuerySchedule(const IRModule& mod, const Target& target, + const String& workload_name) final { + if (f_query_schedule == nullptr) { + return DatabaseNode::QuerySchedule(mod, target, workload_name); + } else { + return f_query_schedule(mod, target, workload_name); + } + } + + Optional QueryIRModule(const IRModule& mod, const Target& target, + const String& workload_name) final { + if (f_query_ir_module == nullptr) { + return DatabaseNode::QueryIRModule(mod, target, workload_name); + } else { + return f_query_ir_module(mod, target, workload_name); + } + } + int64_t Size() final { ICHECK(f_size != nullptr) << "PyDatabase's Size method not implemented!"; return f_size(); @@ -380,6 +444,9 @@ class Database : public runtime::ObjectRef { * \param f_commit_tuning_record The packed function of `CommitTuningRecord`. * \param f_get_top_k The packed function of `GetTopK`. * \param f_get_all_tuning_records The packed function of `GetAllTuningRecords`. + * \param f_query_tuning_record The packed function of `QueryTuningRecord`. + * \param f_query_schedule The packed function of `QuerySchedule`. + * \param f_query_ir_module The packed function of `QueryIRModule`. * \param f_size The packed function of `Size`. 
* \return The created database. */ @@ -388,6 +455,9 @@ class Database : public runtime::ObjectRef { PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, + PyDatabaseNode::FQueryTuningRecord f_query_tuning_record, + PyDatabaseNode::FQuerySchedule f_query_schedule, + PyDatabaseNode::FQueryIRModule f_query_ir_module, PyDatabaseNode::FSize f_size); /*! \return The current Database in the scope. */ static Optional Current(); diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py index 7a1338f46b20..75b78b118eea 100644 --- a/python/tvm/meta_schedule/database/database.py +++ b/python/tvm/meta_schedule/database/database.py @@ -378,6 +378,9 @@ def __init__( f_commit_tuning_record: Callable = None, f_get_top_k: Callable = None, f_get_all_tuning_records: Callable = None, + f_query_tuning_record: Callable = None, + f_query_schedule: Callable = None, + f_query_ir_module: Callable = None, f_size: Callable = None, ): """Constructor.""" @@ -389,6 +392,9 @@ def __init__( f_commit_tuning_record, f_get_top_k, f_get_all_tuning_records, + f_query_tuning_record, + f_query_schedule, + f_query_ir_module, f_size, ) @@ -409,6 +415,9 @@ class PyDatabase: "commit_tuning_record", "get_top_k", "get_all_tuning_records", + "query_tuning_record", + "query_schedule", + "query_ir_module", "__len__", ], } @@ -478,6 +487,78 @@ def get_all_tuning_records(self) -> List[TuningRecord]: """ raise NotImplementedError + def query_tuning_record( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[TuningRecord]: + """Query a tuning record from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + workload_name : Optional[str] + The workload name to be searched for. 
+ + Returns + ------- + record : Optional[TuningRecord] + The tuning record corresponding to the given workload. + """ + # Using self._outer to replace the self pointer + return _ffi_api.DatabaseQueryTuningRecord( # type: ignore # pylint: disable=no-member + self._outer(), mod, target, workload_name # type: ignore # pylint: disable=no-member + ) + + def query_schedule( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[Schedule]: + """Query a schedule from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + workload_name : Optional[str] + The workload name to be searched for. + + Returns + ------- + schedule : Optional[Schedule] + The schedule corresponding to the given workload. + """ + # Using self._outer to replace the self pointer + return _ffi_api.DatabaseQuerySchedule( # type: ignore # pylint: disable=no-member + self._outer(), mod, target, workload_name # type: ignore # pylint: disable=no-member + ) + + def query_ir_module( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[IRModule]: + """Query an IRModule from the database. + + Parameters + ---------- + mod : IRModule + The IRModule to be searched for. + target : Target + The target to be searched for. + workload_name : Optional[str] + The workload name to be searched for. + + Returns + ------- + mod : Optional[IRModule] + The IRModule corresponding to the given workload. + """ + # Using self._outer to replace the self pointer + return _ffi_api.DatabaseQueryIRModule( # type: ignore # pylint: disable=no-member + self._outer(), mod, target, workload_name # type: ignore # pylint: disable=no-member + ) + def __len__(self) -> int: """Get the number of records in the database. 
diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index d082ff7a3901..0976e158aaf0 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -217,6 +217,9 @@ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload, PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, + PyDatabaseNode::FQueryTuningRecord f_query_tuning_record, + PyDatabaseNode::FQuerySchedule f_query_schedule, + PyDatabaseNode::FQueryIRModule f_query_ir_module, PyDatabaseNode::FSize f_size) { ObjectPtr n = make_object(); n->f_has_workload = f_has_workload; @@ -224,6 +227,9 @@ Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload, n->f_commit_tuning_record = f_commit_tuning_record; n->f_get_top_k = f_get_top_k; n->f_get_all_tuning_records = f_get_all_tuning_records; + n->f_query_tuning_record = f_query_tuning_record; + n->f_query_schedule = f_query_schedule; + n->f_query_ir_module = f_query_ir_module; n->f_size = f_size; return Database(n); } diff --git a/tests/python/unittest/test_meta_schedule_database.py b/tests/python/unittest/test_meta_schedule_database.py index e6342f1c3536..777c5589a141 100644 --- a/tests/python/unittest/test_meta_schedule_database.py +++ b/tests/python/unittest/test_meta_schedule_database.py @@ -18,11 +18,13 @@ """Test Meta Schedule Database""" import os.path as osp import tempfile -from typing import Callable +from typing import Callable, Optional, List import tvm import tvm.testing +from tvm.target import Target from tvm import meta_schedule as ms +from tvm.meta_schedule.database import TuningRecord, Workload from tvm import tir from tvm.ir.module import IRModule from tvm.script import tir as T @@ -106,6 +108,123 @@ def _equal_record(a: ms.database.TuningRecord, b: ms.database.TuningRecord): assert str(arg0.as_json()) == str(arg1.as_json()) 
+@ms.utils.derived_object +class PyMemoryDatabaseDefault(ms.database.PyDatabase): + def __init__(self): + super().__init__() + self.tuning_records_: List[TuningRecord] = [] + self.workloads_: List[Workload] = [] + + def has_workload(self, mod: IRModule) -> bool: + for workload in self.workloads_: + if tvm.ir.structural_equal(mod, workload.mod): + return True + + def commit_workload(self, mod: IRModule) -> ms.database.Workload: + if self.has_workload(mod): + for workload in self.workloads_: + if tvm.ir.structural_equal(mod, workload.mod): + return workload + else: + workload = ms.database.Workload(mod) + self.workloads_.append(workload) + return workload + + def commit_tuning_record(self, record: TuningRecord) -> None: + self.tuning_records_.append(record) + + def get_all_tuning_records(self) -> List[TuningRecord]: + return self.tuning_records_ + + def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]: + return sorted( + list( + filter( + lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod), + self.tuning_records_, + ) + ), + key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9, + )[:top_k] + + def __len__(self) -> int: + return len(self.tuning_records_) + + +@ms.utils.derived_object +class PyMemoryDatabaseOverride(ms.database.PyDatabase): + def __init__(self): + super().__init__() + self.tuning_records_: List[TuningRecord] = [] + self.workloads_: List[Workload] = [] + + def has_workload(self, mod: IRModule) -> bool: + for workload in self.workloads_: + if tvm.ir.structural_equal(mod, workload.mod): + return True + + def commit_workload(self, mod: IRModule) -> ms.database.Workload: + if self.has_workload(mod): + for workload in self.workloads_: + if tvm.ir.structural_equal(mod, workload.mod): + return workload + else: + workload = ms.database.Workload(mod) + self.workloads_.append(workload) + return workload + + def commit_tuning_record(self, record: TuningRecord) -> None: + 
self.tuning_records_.append(record) + + def get_all_tuning_records(self) -> List[TuningRecord]: + return self.tuning_records_ + + def get_top_k(self, workload: ms.database.Workload, top_k: int) -> List[TuningRecord]: + return sorted( + list( + filter( + lambda x: tvm.ir.structural_equal(workload.mod, x.workload.mod), + self.tuning_records_, + ) + ), + key=lambda x: sum(x.run_secs) / len(x.run_secs) if x.run_secs else 1e9, + )[:top_k] + + def __len__(self) -> int: + return len(self.tuning_records_) + + def query_tuning_record( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[TuningRecord]: + if self.has_workload(mod): + records = self.get_top_k(self.commit_workload(mod), 2) + if len(records) == 1: + return records[0] + elif len(records) == 2: + return records[1] # return the 2nd best if there are two records + return None + + def query_schedule( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[Schedule]: + record = self.query_tuning_record(mod, target, workload_name) + if record is not None: + sch = Schedule(record.workload.mod) + record.trace.apply_to_schedule(sch, remove_postproc=False) + return sch + return None + + def query_ir_module( + self, mod: IRModule, target: Target, workload_name: Optional[str] = None + ) -> Optional[IRModule]: + record = self.query_tuning_record(mod, target, workload_name) + if record is not None: + sch = Schedule(record.workload.mod) + record.trace.apply_to_schedule(sch, remove_postproc=False) + return sch.mod + return None + + def test_meta_schedule_tuning_record_round_trip(): mod: IRModule = Matmul with tempfile.TemporaryDirectory() as tmpdir: @@ -302,10 +421,10 @@ def test_meta_schedule_database_union(): db_2 = ms.database.MemoryDatabase() trace = _create_schedule(mod, _schedule_matmul).trace - def query(db): + def query(db): # pylint: disable=invalid-name return db.query_tuning_record(mod=mod, target=target, workload_name="main").run_secs - def 
commit_record(db, run_sec): + def commit_record(db, run_sec): # pylint: disable=invalid-name db.commit_tuning_record( ms.database.TuningRecord( trace, @@ -331,5 +450,91 @@ def commit_record(db, run_sec): assert run_secs.value == 1.0 +def test_meta_schedule_pydatabase_default_query(): + + mod: IRModule = Matmul + target = tvm.target.Target("llvm") + arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"]) + db = PyMemoryDatabaseDefault() # pylint: disable=invalid-name + sch = _create_schedule(mod, _schedule_matmul) + trace = sch.trace + + def query(db, mod, target, kind): # pylint: disable=invalid-name + return db.query(mod=mod, target=target, workload_name="main", kind=kind) + + def commit_record(trace, db, run_sec): # pylint: disable=invalid-name + db.commit_tuning_record( + ms.database.TuningRecord( + trace, + workload=db.commit_workload(mod), + run_secs=[run_sec], + target=target, + args_info=arg_info, + ) + ) + + commit_record(trace, db, 1.0) + record = query(db, mod, target, "record") + assert record is not None and record.run_secs[0].value == 1.0 + sch_res = query(db, mod, target, "schedule") + assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod) + mod_res = query(db, mod, target, "ir_module") + assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod) + + commit_record(Schedule(mod).trace, db, 0.2) # Empty Trace + record = query(db, mod, target, "record") + assert record is not None and record.run_secs[0].value == 0.2 + sch_res = query(db, mod, target, "schedule") + assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, mod) + mod_res = query(db, mod, target, "ir_module") + assert mod_res is not None and tvm.ir.structural_equal(mod_res, mod) + + +def test_meta_schedule_pydatabase_override_query(): + + mod: IRModule = Matmul + target = tvm.target.Target("llvm") + arg_info = ms.arg_info.ArgInfo.from_prim_func(func=mod["main"]) + db = PyMemoryDatabaseOverride() # pylint: disable=invalid-name + sch = 
_create_schedule(mod, _schedule_matmul) + trace = sch.trace + + def query(db, mod, target, kind): # pylint: disable=invalid-name + return db.query(mod=mod, target=target, workload_name="main", kind=kind) + + def commit_record(trace, db, run_sec): # pylint: disable=invalid-name + db.commit_tuning_record( + ms.database.TuningRecord( + trace, + workload=db.commit_workload(mod), + run_secs=[run_sec], + target=target, + args_info=arg_info, + ) + ) + + commit_record(trace, db, 1.14) + record = query(db, mod, target, "record") + assert record is not None and record.run_secs[0].value == 1.14 + sch_res = query(db, mod, target, "schedule") + assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod) + mod_res = query(db, mod, target, "ir_module") + assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod) + + commit_record(Schedule(mod).trace, db, 0.514) # Empty Trace + record = query(db, mod, target, "record") + assert record is not None and record.run_secs[0].value == 1.14 # Override to 2nd best + sch_res = query(db, mod, target, "schedule") + assert sch_res is not None and tvm.ir.structural_equal(sch_res.mod, sch.mod) + mod_res = query(db, mod, target, "ir_module") + assert mod_res is not None and tvm.ir.structural_equal(mod_res, sch.mod) + + +def test_meta_schedule_pydatabase_current(): + db = PyMemoryDatabaseDefault() # pylint: disable=invalid-name + with db: # pylint: disable=not-context-manager + assert ms.database.Database.current() == db + + if __name__ == "__main__": tvm.testing.main() From e18b48bed82da917e8e9a754217135bb5901f2a6 Mon Sep 17 00:00:00 2001 From: YudongChen <529641713@qq.com> Date: Tue, 20 Sep 2022 11:07:46 +0800 Subject: [PATCH 211/704] [Fix] naming outputs of graph nodes by op_name:output_index (#12809) to avoid fuzziness when the num of outputs per node is greater than 1. 
(#12672) Co-authored-by: victor.chen --- src/runtime/graph_executor/graph_executor.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index fc7e82bed4e2..d805abfc658a 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -97,7 +97,9 @@ void GraphExecutor::Init(const std::string& graph_json, tvm::runtime::Module mod for (size_t i = 0; i < outputs_.size(); i++) { const uint32_t nid = outputs_[i].node_id; std::string& name = nodes_[nid].name; - output_map_[name] = i; + std::stringstream ss; + ss << name << ":" << i; + output_map_[ss.str()] = i; } } From 18909a4c135cb8df5125cf1e417de7a35e02e705 Mon Sep 17 00:00:00 2001 From: "Sevin F. Varoglu" Date: Tue, 20 Sep 2022 18:10:23 +0300 Subject: [PATCH 212/704] [ONNX] Upgrade onnx and onnxruntime (#12729) Upgrade onnx and onnxruntime to latest --- docker/install/ubuntu_install_onnx.sh | 4 +- tests/python/frontend/onnx/test_forward.py | 64 ++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh index 7bd15afd7eb3..d775875bc7c5 100755 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -27,8 +27,8 @@ set -o pipefail # https://github.com/onnx/onnx/pull/2834). When updating the CI image # to onnx>=1.9, onnxoptimizer should also be installed. 
pip3 install \ - onnx==1.10.2 \ - onnxruntime==1.9.0 \ + onnx==1.12.0 \ + onnxruntime==1.12.1 \ onnxoptimizer==0.2.7 # torch depends on a number of other packages, but unhelpfully, does diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 893815de7e5c..17a0513844ba 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5233,6 +5233,10 @@ def verify_eyelike(indata, dynamic=False): "test_bernoulli_double_expanded", "test_bernoulli_seed", "test_bernoulli_seed_expanded", + "test_blackmanwindow", + "test_blackmanwindow_expanded", + "test_blackmanwindow_symmetric", + "test_blackmanwindow_symmetric_expanded", "test_cast_DOUBLE_to_FLOAT16", "test_cast_FLOAT_to_STRING", "test_cast_STRING_to_FLOAT", @@ -5262,19 +5266,61 @@ def verify_eyelike(indata, dynamic=False): "test_cumsum_2d_negative_axis", "test_det_2d", "test_det_nd", + "test_dft", + "test_dft_axis", + "test_dft_inverse", "test_dropout_default", "test_dropout_default_mask", "test_dropout_default_mask_ratio", "test_dropout_default_ratio", + "test_gridsample", + "test_gridsample_aligncorners_true", + "test_gridsample_bicubic", + "test_gridsample_bilinear", + "test_gridsample_border_padding", + "test_gridsample_nearest", + "test_gridsample_reflection_padding", + "test_gridsample_zeros_padding", "test_gru_batchwise", + "test_hammingwindow", + "test_hammingwindow_expanded", + "test_hammingwindow_symmetric", + "test_hammingwindow_symmetric_expanded", + "test_hannwindow", + "test_hannwindow_expanded", + "test_hannwindow_symmetric", + "test_hannwindow_symmetric_expanded", + "test_identity_opt", "test_identity_sequence", + "test_if_opt", "test_if_seq", + "test_layer_normalization_2d_axis0", + "test_layer_normalization_2d_axis1", + "test_layer_normalization_2d_axis_negative_1", + "test_layer_normalization_2d_axis_negative_2", + "test_layer_normalization_3d_axis0_epsilon", + "test_layer_normalization_3d_axis1_epsilon", + 
"test_layer_normalization_3d_axis2_epsilon", + "test_layer_normalization_3d_axis_negative_1_epsilon", + "test_layer_normalization_3d_axis_negative_2_epsilon", + "test_layer_normalization_3d_axis_negative_3_epsilon", + "test_layer_normalization_4d_axis0", + "test_layer_normalization_4d_axis1", + "test_layer_normalization_4d_axis2", + "test_layer_normalization_4d_axis3", + "test_layer_normalization_4d_axis_negative_1", + "test_layer_normalization_4d_axis_negative_2", + "test_layer_normalization_4d_axis_negative_3", + "test_layer_normalization_4d_axis_negative_4", + "test_layer_normalization_default_axis", "test_loop11", "test_loop13_seq", + "test_loop16_seq_none", "test_lstm_batchwise", "test_maxpool_with_argmax_2d_precomputed_pads", "test_maxpool_with_argmax_2d_precomputed_strides", "test_maxunpool_export_with_output_shape", + "test_melweightmatrix", # This test fails llvm with a lowering error: "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", "test_optional_has_element", @@ -5290,8 +5336,24 @@ def verify_eyelike(indata, dynamic=False): "test_reduce_sum_keepdims_random", "test_reduce_sum_negative_axes_keepdims_example", "test_reduce_sum_negative_axes_keepdims_random", + "test_roialign_aligned_true", + "test_scatter_elements_with_duplicate_indices", + "test_scatternd_add", + "test_scatternd_multiply", "test_sequence_insert_at_back", "test_sequence_insert_at_front", + "test_sequence_map_add_1_sequence_1_tensor", + "test_sequence_map_add_1_sequence_1_tensor_expanded", + "test_sequence_map_add_2_sequences", + "test_sequence_map_add_2_sequences_expanded", + "test_sequence_map_extract_shapes", + "test_sequence_map_extract_shapes_expanded", + "test_sequence_map_identity_1_sequence", + "test_sequence_map_identity_1_sequence_1_tensor", + "test_sequence_map_identity_1_sequence_1_tensor_expanded", + "test_sequence_map_identity_1_sequence_expanded", + "test_sequence_map_identity_2_sequences", + "test_sequence_map_identity_2_sequences_expanded", 
"test_simple_rnn_batchwise", "test_simple_rnn_defaults", "test_simple_rnn_with_initial_bias", @@ -5299,6 +5361,8 @@ def verify_eyelike(indata, dynamic=False): "test_split_variable_parts_2d", "test_split_variable_parts_default_axis", "test_split_zero_size_splits", + "test_stft", + "test_stft_with_window", "test_strnormalizer_export_monday_casesensintive_lower", "test_strnormalizer_export_monday_casesensintive_nochangecase", "test_strnormalizer_export_monday_casesensintive_upper", From ecd003c742da85d4945c7d02e9301e07ad413136 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 20 Sep 2022 08:10:59 -0700 Subject: [PATCH 213/704] [ci] Lint PR title/body for @ tags (#12840) This ensures that no users are tagged directly in PR titles or descriptions, which lets us finally turn this on https://github.blog/changelog/2022-08-23-new-options-for-controlling-the-default-commit-message-when-merging-a-pull-request/ --- ci/scripts/check_pr.py | 18 ++++++++---------- tests/python/ci/test_ci.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py index 9af5ec5580a3..8be5c0ee46a8 100755 --- a/ci/scripts/check_pr.py +++ b/ci/scripts/check_pr.py @@ -69,19 +69,17 @@ def trailing_period(s: str): title_checks = [ Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"), Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"), - # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done - # Check( - # check=usernames, - # error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}", - # ), + Check( + check=usernames, + error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}", + ), ] body_checks = [ Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"), - # TODO(driazati): enable this check once 
https://github.com/apache/tvm/issues/12637 is done - # Check( - # check=usernames, - # error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}", - # ), + Check( + check=usernames, + error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}", + ), ] diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 4b8c5d9ad444..8c7c9f6bb409 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -1327,6 +1327,18 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec expected="non_empty: FAILED", expected_code=1, ), + user_title=dict( + title="[something] a change @someon", + body="hello", + expected="usernames: FAILED: PR title must not tag", + expected_code=1, + ), + user_body=dict( + title="[something] a change", + body="hello\n\n cc @someone", + expected="usernames: FAILED: PR body must not tag", + expected_code=1, + ), ) def test_pr_linter(title, body, expected, expected_code): """ From d9f7cf3539bc9e94f7b5b2c343536388e1b7fd26 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 20 Sep 2022 09:48:23 -0700 Subject: [PATCH 214/704] Revert "[ci] Lint PR title/body for @ tags (#12840)" (#12848) This reverts commit ecd003c742da85d4945c7d02e9301e07ad413136. The check needs to ignore @ s in some cases, such as within code blocks. 
--- ci/scripts/check_pr.py | 18 ++++++++++-------- tests/python/ci/test_ci.py | 12 ------------ 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/ci/scripts/check_pr.py b/ci/scripts/check_pr.py index 8be5c0ee46a8..9af5ec5580a3 100755 --- a/ci/scripts/check_pr.py +++ b/ci/scripts/check_pr.py @@ -69,17 +69,19 @@ def trailing_period(s: str): title_checks = [ Check(check=non_empty, error_fn=lambda d: "PR must have a title but title was empty"), Check(check=trailing_period, error_fn=lambda d: "PR must not end in a tailing '.'"), - Check( - check=usernames, - error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}", - ), + # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done + # Check( + # check=usernames, + # error_fn=lambda d: f"PR title must not tag anyone but found these usernames: {d}", + # ), ] body_checks = [ Check(check=non_empty, error_fn=lambda d: "PR must have a body but body was empty"), - Check( - check=usernames, - error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}", - ), + # TODO(driazati): enable this check once https://github.com/apache/tvm/issues/12637 is done + # Check( + # check=usernames, + # error_fn=lambda d: f"PR body must not tag anyone but found these usernames: {d}", + # ), ] diff --git a/tests/python/ci/test_ci.py b/tests/python/ci/test_ci.py index 8c7c9f6bb409..4b8c5d9ad444 100644 --- a/tests/python/ci/test_ci.py +++ b/tests/python/ci/test_ci.py @@ -1327,18 +1327,6 @@ def test_should_rebuild_docker(tmpdir_factory, changed_files, name, check, expec expected="non_empty: FAILED", expected_code=1, ), - user_title=dict( - title="[something] a change @someon", - body="hello", - expected="usernames: FAILED: PR title must not tag", - expected_code=1, - ), - user_body=dict( - title="[something] a change", - body="hello\n\n cc @someone", - expected="usernames: FAILED: PR body must not tag", - expected_code=1, - ), ) def test_pr_linter(title, body, 
expected, expected_code): """ From 5dfa8da00ec658934f3fc0df8eb9f41a167e1545 Mon Sep 17 00:00:00 2001 From: Adam Straw Date: Tue, 20 Sep 2022 12:38:04 -0700 Subject: [PATCH 215/704] [Hexagon] 2-Stage Pipeline; Lower Async TIR primitives to Hexagon User DMA (#12785) * [Hexagon] 2-Stage Pipeline; Lower Async TIR primitives to HexagonUserDMA * save queue ID in `copy`, inspect in `wait` transform; add comments * improve testing; parameters for shape, scope, dtype * add log statements and adjust comments to clarify pass behavior * generalize use_async_copy for pass enable * use DLOG instead of LOG * trigger ci * trigger ci again --- include/tvm/tir/builtin.h | 10 + include/tvm/tir/transform.h | 5 + src/driver/driver_api.cc | 12 +- src/runtime/hexagon/hexagon_device_api.cc | 25 +++ src/tir/op/builtin.cc | 6 + src/tir/transforms/lower_async_dma.cc | 194 ++++++++++++++++++ src/tir/transforms/lower_tvm_builtin.cc | 30 +++ .../test_software_pipeline_async.py | 86 ++++++++ ...est_tir_transform_inject_ptx_async_copy.py | 4 +- ..._tir_transform_inject_software_pipeline.py | 2 +- 10 files changed, 367 insertions(+), 7 deletions(-) create mode 100644 src/tir/transforms/lower_async_dma.cc create mode 100644 tests/python/contrib/test_hexagon/test_software_pipeline_async.py diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index 12290a97c840..a1a97595bfd8 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -720,6 +720,16 @@ TVM_DLL const Op& texture2d_load(); */ TVM_DLL const Op& mem_copy(); +/*! + * \brief Initiate a non-blocking DMA copy from source to destination + */ +TVM_DLL const Op& dma_copy(); + +/*! + * \brief Wait until the number of DMAs in flight is less than or equal to some maximum + */ +TVM_DLL const Op& dma_wait(); + /*! 
* \brief Provide a true statement that can be used for simplifications * diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index fd4261e4a4e3..a4caeee43604 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -485,6 +485,11 @@ TVM_DLL Pass TextureFlatten(); */ TVM_DLL Pass LowerVtcmAlloc(); +/*! + * \brief Lower Async TIR primitives to DMA copy and wait builtins + */ +TVM_DLL Pass LowerAsyncDMA(); + /*! * \brief Implements a Common Subexpression Elimination (CSE) for TIR * which introduces let-in bindings for duplicated sub-expressions. diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index e528686d967d..1a617dcd494d 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -50,7 +50,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool); TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array>); TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool); -TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_ptx_async_copy", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool); using runtime::PackedFunc; using runtime::TVMArgs; @@ -225,6 +225,11 @@ Array CreatePassList(bool disable_loop_partition) { } // LowerVtcmAlloc must occur after any transformations that modify memory allocation locations pass_list.push_back(tir::transform::LowerVtcmAlloc()); + bool use_async_copy = pass_ctx->GetConfig("tir.use_async_copy", Bool(false)).value(); + + if (use_async_copy) { + pass_list.push_back(tir::transform::LowerAsyncDMA()); + } pass_list.push_back(tir::transform::UnrollLoop()); // Add user-defined phase-2 passes @@ -543,10 +548,9 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); - bool use_ptx_async_copy = - 
pass_ctx->GetConfig("tir.use_ptx_async_copy", Bool(false)).value(); + bool use_async_copy = pass_ctx->GetConfig("tir.use_async_copy", Bool(false)).value(); - if (use_ptx_async_copy) { + if (use_async_copy) { mixed_pass_list.push_back(tir::transform::InjectPTXAsyncCopy()); } diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 463d9799b082..84232a614428 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -33,6 +33,7 @@ #include "../workspace_pool.h" #include "hexagon_common.h" +#include "hexagon_user_dma.h" namespace tvm { namespace runtime { @@ -206,6 +207,30 @@ TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVM *rv = static_cast(0); }); +TVM_REGISTER_GLOBAL("device_api.hexagon.dma_copy").set_body([](TVMArgs args, TVMRetValue* rv) { + int queue_id = args[0]; + ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA"); + void* dst = args[1]; + void* src = args[2]; + int size = args[3]; + ICHECK(size > 0); + + int ret = DMA_RETRY; + do { + ret = HexagonUserDMA::Get().Copy(dst, src, size); + } while (ret == DMA_RETRY); + *rv = static_cast(ret); +}); + +TVM_REGISTER_GLOBAL("device_api.hexagon.dma_wait").set_body([](TVMArgs args, TVMRetValue* rv) { + int queue_id = args[0]; + ICHECK(queue_id == 0 && "Hexagon supports just a single asynchronous queue for DMA"); + int inflight = args[1]; + ICHECK(inflight >= 0); + HexagonUserDMA::Get().Wait(inflight); + *rv = static_cast(0); +}); + TVM_REGISTER_GLOBAL("device_api.hexagon.alloc_nd").set_body([](TVMArgs args, TVMRetValue* rv) { int32_t device_type = args[0]; int32_t device_id = args[1]; diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 9642f8e39f39..1e2d790c76e1 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -288,6 +288,12 @@ TIR_DEFINE_BUILTIN_FUNC(texture2d_load) TIR_DEFINE_BUILTIN_FUNC(mem_copy).set_attr("TCallEffectKind", 
Integer(CallEffectKind::kOpaque)); +TIR_DEFINE_BUILTIN_FUNC(dma_copy).set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + +TIR_DEFINE_BUILTIN_FUNC(dma_wait).set_attr("TCallEffectKind", + Integer(CallEffectKind::kOpaque)); + TIR_DEFINE_BUILTIN_FUNC(assume) .set_attr("TCallEffectKind", Integer(CallEffectKind::kEmbedInfo)) .set_num_inputs(1); diff --git a/src/tir/transforms/lower_async_dma.cc b/src/tir/transforms/lower_async_dma.cc new file mode 100644 index 000000000000..78d363f67c02 --- /dev/null +++ b/src/tir/transforms/lower_async_dma.cc @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file lower_async_dma.cc + */ + +#include +#include +#include + +#include "ir_utils.h" + +namespace tvm { +namespace tir { + +class AsyncDMALowerer : public StmtExprMutator { + public: + AsyncDMALowerer() {} + + Stmt VisitStmt_(const AttrStmtNode* op) final { + // Convert this, for example: + // attr [0] "async_wait_queue_scope" = 0; + // attr [0] "async_wait_inflight_count" = 0; + // + // To this: + // @tir.dma_wait( + // 0, /* queue id */ + // 0, /* in flight count */ + // dtype=int32 + // ) + if (op->attr_key == tir::attr::async_wait_queue_scope) { + // get queue ID + auto queue_id_node = op->value.as(); + ICHECK(queue_id_node); + int queue_id = queue_id_node->value; + + // abort if we have not seen this queue ID in `copy` transform + if (queue_ids.find(queue_id) == queue_ids.end()) { + DLOG(INFO) << "AsyncDMALowerer exiting because the queue ID observed in the " + "`async_wait_queue_scope` transform has not been previously observed in the " + "`async_commit_queue_scope` transform"; + return StmtExprMutator::VisitStmt_(op); + } + + auto async_wait = op->body.as(); + if (!async_wait || async_wait->attr_key != tir::attr::async_wait_inflight_count) { + DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key " + "`async_wait_queue_scope` does not contain an `AttrStmtNode` with key " + "`async_wait_inflight_count`"; + return StmtExprMutator::VisitStmt_(op); + } + + auto call_dma_wait = + Evaluate(Call(DataType::Int(32), builtin::dma_wait(), {queue_id, async_wait->value})); + + // concatenate the call with the body and return + return SeqStmt({call_dma_wait, async_wait->body}); + + // Convert this, for example: + // attr [0] "async_commit_queue_scope" = 0; + // attr [0] "async_scope" = 1; + // for (ax0: int32, 0, 128) { + // A_global[ax0] = A[ax0] + // } + // + // To this: + // @tir.dma_copy( + // 0, /* queue id */ + // @tir.address_of(A_global[0], dtype=handle), + // @tir.address_of(A[0], dtype=handle), + // 128, /* size */ + // 
dtype=int32 + // ) + } else if (op->attr_key == tir::attr::async_commit_queue_scope) { + // get queue ID + auto queue_id_node = op->value.as(); + ICHECK(queue_id_node); + int queue_id = queue_id_node->value; + + // save queue ID for inspection in `wait` transform + queue_ids.insert(queue_id); + + // walk the graph to verify this is a mem copy ... + // 1) async_commit_queue_scope contains async_scope + auto async_scope = op->body.as(); + if (!async_scope || async_scope->attr_key != tir::attr::async_scope) { + DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key " + "`async_commit_queue_scope` does not contain an `AttrStmtNode` with key " + "`async_scope`"; + return StmtExprMutator::VisitStmt_(op); + } + + // 2) async_scope contains single for loop + auto for_loop = async_scope->body.as(); + if (!for_loop) { + DLOG(INFO) << "AsyncDMALowerer exiting because the body of the `AttrStmtNode` with key " + "`async_scope` does not contain a single `ForNode`"; + return StmtExprMutator::VisitStmt_(op); + } + + // 3) for loop contains buffer store with single index + auto bufferstorenode = for_loop->body.as(); + if (!bufferstorenode || bufferstorenode->indices.size() != 1) { + DLOG(INFO) + << "AsyncDMALowerer exiting because the body of the `ForNode` does not contain a " + "single `BufferStoreNode` with a single index variable"; + return StmtExprMutator::VisitStmt_(op); + } + + // 4) buffer store value is a buffer load with single index + auto bufferloadnode = bufferstorenode->value.as(); + if (!bufferloadnode || bufferloadnode->indices.size() != 1) { + DLOG(INFO) << "AsyncDMALowerer exiting because the value of the `BufferStoreNode` is not a " + "single `BufferLoadNode` with a single index variable"; + return StmtExprMutator::VisitStmt_(op); + } + + // get store buffer; assert it exists and is contiguous given it uses a single index + auto bufferstore = bufferstorenode->buffer.as(); + ICHECK(bufferstore && bufferstore->strides.empty()); + + // 
get load buffer; assert it exists and is contiguous given it uses a single index + auto bufferload = bufferloadnode->buffer.as(); + ICHECK(bufferload && bufferload->strides.empty()); + + // we will be replacing the entire for loop including its index + // with a DMA copy instrinsic that spans the entire index space of the for loop + // so we will need to replace the for loop index with value zero in the buffer indices + // thus we eliminate the index from the expression so the DMA copy receives the buffer range + // base address + Map loop_var_remap = {{for_loop->loop_var, IntImm(DataType::Int(32), 0)}}; + + // map loop variable to zero for the store index & simplify + Array store_index = bufferstorenode->indices; + store_index.MutateByApply([&](PrimExpr expr) { + arith::Analyzer analyzer; + return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap)); + }); + + // map loop variable to zero for the load index & simplify + Array load_index = bufferloadnode->indices; + load_index.MutateByApply([&](PrimExpr expr) { + arith::Analyzer analyzer; + return analyzer.Simplify(Substitute(std::move(expr), loop_var_remap)); + }); + + return Evaluate(Call(DataType::Int(32), builtin::dma_copy(), + {queue_id, + Call(DataType::Handle(), builtin::address_of(), + {BufferLoad(bufferstorenode->buffer, store_index)}), + Call(DataType::Handle(), builtin::address_of(), + {BufferLoad(bufferloadnode->buffer, load_index)}), + for_loop->extent * bufferloadnode->dtype.bytes()})); + } + return StmtExprMutator::VisitStmt_(op); + } + + private: + std::set queue_ids; +}; + +namespace transform { + +Pass LowerAsyncDMA() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + auto fptr = f.CopyOnWrite(); + fptr->body = AsyncDMALowerer()(std::move(fptr->body)); + return f; + }; + return CreatePrimFuncPass(pass_func, 0, "tir.LowerAsyncDMA", {}); +} + +TVM_REGISTER_GLOBAL("tir.transform.LowerAsyncDMA").set_body_typed(LowerAsyncDMA); +} // namespace transform + +} // namespace tir 
+} // namespace tvm diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc index 9d0087cc7a0b..f79682ef7ecc 100644 --- a/src/tir/transforms/lower_tvm_builtin.cc +++ b/src/tir/transforms/lower_tvm_builtin.cc @@ -317,6 +317,10 @@ class BuiltinLower : public StmtExprMutator { return make_zero(op->dtype); } else if (op->op.same_as(builtin::mem_copy())) { return MakeMemCopy(op); + } else if (op->op.same_as(builtin::dma_copy())) { + return MakeDMACopy(op); + } else if (op->op.same_as(builtin::dma_wait())) { + return MakeDMAWait(op); } else { return StmtExprMutator::VisitExpr_(op); } @@ -335,6 +339,32 @@ class BuiltinLower : public StmtExprMutator { return VisitExpr(call_packed); } + PrimExpr MakeDMACopy(const CallNode* op) { + PrimExpr queue_id = op->args[0]; + PrimExpr dst = op->args[1]; + PrimExpr src = op->args[2]; + PrimExpr size = op->args[3]; + + std::string fdevapi_prefix = + "device_api." + std::string(runtime::DeviceName(device_type_.as()->value)); + + Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".dma_copy"), queue_id, dst, src, size}); + return VisitExpr(call_packed); + } + + PrimExpr MakeDMAWait(const CallNode* op) { + PrimExpr queue_id = op->args[0]; + PrimExpr inflight = op->args[1]; + + std::string fdevapi_prefix = + "device_api." 
+ std::string(runtime::DeviceName(device_type_.as()->value)); + + Call call_packed = Call(DataType::Int(32), builtin::tvm_call_packed(), + {StringImm(fdevapi_prefix + ".dma_wait"), queue_id, inflight}); + return VisitExpr(call_packed); + } + // call shape PrimExpr MakeShape(const CallNode* op) { // if args.size() == 0, it represents a scalar shape () diff --git a/tests/python/contrib/test_hexagon/test_software_pipeline_async.py b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py new file mode 100644 index 000000000000..6bcca90ec9d3 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_software_pipeline_async.py @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +import pytest +import numpy as np + +import tvm +from tvm import tir +from tvm.contrib.hexagon.session import Session +from tvm.script import tir as T + +outer = tvm.testing.parameter(8, 16) +inner = tvm.testing.parameter(64, 128) +scope = tvm.testing.parameter("global", "global.vtcm") +dtype = tvm.testing.parameter("uint8", "float16") + + +@tvm.testing.fixture +def compute(outer, inner, dtype): + @T.prim_func + def plus_one_primfunc(A: T.Buffer[(outer, inner), dtype], B: T.Buffer[(outer, inner), dtype]): + for i in T.serial(outer): + for j in T.serial(inner): + with T.block("compute"): + with T.block(): + B[i, j] = A[i, j] + T.cast(1, dtype) + + def plus_one_ref(a): + return a + 1 + + return plus_one_primfunc, plus_one_ref + + +@tvm.testing.requires_hexagon +def test_software_pipeline_with_cache_read(hexagon_launcher, compute, outer, inner, dtype, scope): + sch = tir.Schedule(compute[0]) + root = sch.get_block("root") + compute_block = sch.get_block("compute") + cache_read_block = sch.cache_read(compute_block, 0, scope) + + i, _ = sch.get_loops(compute_block) + sch.compute_at(cache_read_block, i) + sch.annotate(i, "software_pipeline_stage", [0, 1]) + sch.annotate(i, "software_pipeline_order", [0, 1]) + sch.annotate(i, "software_pipeline_async_stages", [0]) + + a_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + b_np = np.random.uniform(low=0, high=128, size=(outer, inner)).astype(dtype) + ref = compute[1](a_np) + + target_hexagon = tvm.target.hexagon("v68", link_params=True) + with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): + func = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + + with hexagon_launcher.start_session() as hexagon_session: + dev = hexagon_session.device + a = tvm.nd.array(a_np, device=dev) + b = tvm.nd.array(b_np, device=dev) + mod = hexagon_session.load_module(func) + mod(a, b) + + if "int" in dtype: + np.testing.assert_equal(b.numpy(), 
ref) + else: + np.testing.assert_allclose(b.numpy(), ref, rtol=1e-3, atol=1e-3) + + +if __name__ == "__main__": + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py index 1a906b2fb66e..7062d5129713 100644 --- a/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py +++ b/tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py @@ -138,7 +138,7 @@ def test_inject_async_copy(): if not tvm.testing.is_ampere_or_newer(): continue - with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}): + with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda") A_np = np.random.rand(32, 128).astype(dtype) @@ -166,7 +166,7 @@ def test_inject_async_copy_shared_dyn(): if not tvm.testing.is_ampere_or_newer(): return - with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}): + with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): mod = tvm.build(tvm.IRModule.from_expr(f), target="cuda") A_np = np.random.rand(32, 128).astype("float16") diff --git a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py index edaeb7c9b639..49255e0f2094 100644 --- a/tests/python/unittest/test_tir_transform_inject_software_pipeline.py +++ b/tests/python/unittest/test_tir_transform_inject_software_pipeline.py @@ -1390,7 +1390,7 @@ def index_map(i, j): def build_and_run(sch): if tvm.testing.is_ampere_or_newer(): - with tvm.transform.PassContext(config={"tir.use_ptx_async_copy": 1}): + with tvm.transform.PassContext(config={"tir.use_async_copy": 1}): f = tvm.build(sch.mod["main"], target="cuda") dev = tvm.device("cuda", 0) From 534378b935aa08b77e7529ec183133a24f121ae4 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 20 Sep 2022 15:49:46 -0500 Subject: [PATCH 216/704] 
[Containers] Add Array::Map (#12692) * [Containers] Add Array::Map Previously, an in-place mutation could be applied to an array using `Array::MutateByApply`, but this couldn't be used for transformations that return a new array, or for transformations that return a new type. The commit adds `Array::Map`, which can map to any `ObjectRef` subclass. For mappings that return the same type, this is done by delegating to `Array::MutateByApply`, to take advantage of the same copy-on-write behavior. * [Refactor] Use Array::Map where possible With the new `Array::Map` functionality, many places that previously used explicit loops or `tvm::tir::MutateArray` can be cleaned. * Merge the Map and MutateInPlace implementations * Fix off-by-one error in MapHelper * Updated with unit tests for Array::Map conversions * Improved comments explaining the copy-on-write in MapHelper --- include/tvm/runtime/container/array.h | 198 ++++++++++++++---- src/ir/type_functor.cc | 9 +- src/te/operation/create_primfunc.cc | 2 +- src/tir/analysis/device_constraint_utils.cc | 5 +- src/tir/ir/buffer.cc | 4 +- src/tir/ir/expr.cc | 3 +- src/tir/ir/expr_functor.cc | 14 +- src/tir/ir/functor_common.h | 3 +- src/tir/ir/index_map.cc | 5 +- src/tir/ir/specialize.cc | 19 +- src/tir/ir/stmt_functor.cc | 3 +- .../schedule/primitive/decompose_padding.cc | 15 +- src/tir/schedule/transform.cc | 8 +- src/tir/transforms/inject_virtual_thread.cc | 4 +- src/tir/transforms/lower_match_buffer.cc | 8 +- src/tir/transforms/renew_defs.cc | 37 ++-- src/tir/transforms/vectorize_loop.cc | 6 +- tests/cpp/container_test.cc | 135 ++++++++++++ 18 files changed, 353 insertions(+), 125 deletions(-) diff --git a/include/tvm/runtime/container/array.h b/include/tvm/runtime/container/array.h index 26f4e545deb7..11bacb18e92c 100644 --- a/include/tvm/runtime/container/array.h +++ b/include/tvm/runtime/container/array.h @@ -26,10 +26,12 @@ #include #include +#include #include #include #include "./base.h" +#include "./optional.h" 
namespace tvm { namespace runtime { @@ -248,6 +250,23 @@ class ArrayNode : public Object, public InplaceArrayBase { friend ObjectPtr make_object<>(); }; +/*! \brief Helper struct for type-checking + * + * is_valid_iterator::value will be true if IterType can + * be dereferenced into a type that can be stored in an Array, and + * false otherwise. + */ +template +struct is_valid_iterator + : std::bool_constant())>>>> {}; + +template +struct is_valid_iterator, IterType> : is_valid_iterator {}; + +template +inline constexpr bool is_valid_iterator_v = is_valid_iterator::value; + /*! * \brief Array, container representing a contiguous sequence of ObjectRefs. * @@ -574,54 +593,39 @@ class Array : public ObjectRef { /*! \return The underlying ArrayNode */ ArrayNode* GetArrayNode() const { return static_cast(data_.get()); } + /*! + * \brief Helper function to apply a map function onto the array. + * + * \param fmap The transformation function T -> U. + * + * \tparam F The type of the mutation function. + * + * \tparam U The type of the returned array, inferred from the + * return type of F. If overridden by the user, must be something + * that is convertible from the return type of F. + * + * \note This function performs copy on write optimization. If + * `fmap` returns an object of type `T`, and all elements of the + * array are mapped to themselves, then the returned array will be + * the same as the original, and reference counts of the elements in + * the array will not be incremented. + * + * \return The transformed array. + */ + template > + Array Map(F fmap) const { + return Array(MapHelper(data_, fmap)); + } + /*! * \brief Helper function to apply fmutate to mutate an array. * \param fmutate The transformation function T -> T. * \tparam F the type of the mutation function. * \note This function performs copy on write optimization. 
*/ - template + template >>> void MutateByApply(F fmutate) { - if (data_ == nullptr) { - return; - } - struct StackFrame { - ArrayNode* p; - ObjectRef* itr; - int64_t i; - int64_t size; - }; - std::unique_ptr s = std::make_unique(); - s->p = GetArrayNode(); - s->itr = s->p->MutableBegin(); - s->i = 0; - s->size = s->p->size_; - if (!data_.unique()) { - // Loop invariant: keeps iterating when - // 1) data is not unique - // 2) no elements are actually mutated yet - for (; s->i < s->size; ++s->i, ++s->itr) { - T new_elem = fmutate(DowncastNoCheck(*s->itr)); - // do nothing when there is no mutation - if (new_elem.same_as(*s->itr)) { - continue; - } - // loop invariant breaks when the first real mutation happens - // we copy the elements into a new unique array - ObjectPtr copy = ArrayNode::CopyFrom(s->p->capacity_, s->p); - s->itr = copy->MutableBegin() + (s->i++); - *s->itr++ = std::move(new_elem); - data_ = std::move(copy); - // make sure `data_` is unique and break - break; - } - } - // when execution comes to this line, it is guaranteed that either - // 1) i == size - // or 2) data_.unique() is true - for (; s->i < s->size; ++s->i, ++s->itr) { - *s->itr = std::move(fmutate(std::move(DowncastNoCheck(std::move(*s->itr))))); - } + data_ = MapHelper(std::move(data_), fmutate); } /*! @@ -706,6 +710,118 @@ class Array : public ObjectRef { } return static_cast(data_.get()); } + + /*! \brief Helper method for mutate/map + * + * A helper function used internally by both `Array::Map` and + * `Array::MutateInPlace`. Given an array of data, apply the + * mapping function to each element, returning the collected array. + * Applies both mutate-in-place and copy-on-write optimizations, if + * possible. + * + * \param data A pointer to the ArrayNode containing input data. + * Passed by value to allow for mutate-in-place optimizations. + * + * \param fmap The mapping function + * + * \tparam F The type of the mutation function. 
+ * + * \tparam U The output type of the mutation function. Inferred + * from the callable type given. Must inherit from ObjectRef. + * + * \return The mapped array. Depending on whether mutate-in-place + * or copy-on-write optimizations were applicable, may be the same + * underlying array as the `data` parameter. + */ + template > + static ObjectPtr MapHelper(ObjectPtr data, F fmap) { + if (data == nullptr) { + return nullptr; + } + + ICHECK(data->IsInstance()); + + constexpr bool is_same_output_type = std::is_same_v; + + if constexpr (is_same_output_type) { + if (data.unique()) { + // Mutate-in-place path. Only allowed if the output type U is + // the same as type T, we have a mutable this*, and there are + // no other shared copies of the array. + auto arr = static_cast(data.get()); + for (auto it = arr->MutableBegin(); it != arr->MutableEnd(); it++) { + T mapped = fmap(DowncastNoCheck(std::move(*it))); + *it = std::move(mapped); + } + return data; + } + } + + constexpr bool compatible_types = is_valid_iterator_v || is_valid_iterator_v; + + ObjectPtr output = nullptr; + auto arr = static_cast(data.get()); + + auto it = arr->begin(); + if constexpr (compatible_types) { + // Copy-on-write path, if the output Array might be + // represented by the same underlying array as the existing + // Array. Typically, this is for functions that map `T` to + // `T`, but can also apply to functions that map `T` to + // `Optional`, or that map `T` to a subclass or superclass of + // `T`. + bool all_identical = true; + for (; it != arr->end(); it++) { + U mapped = fmap(DowncastNoCheck(*it)); + if (!mapped.same_as(*it)) { + // At least one mapped element is different than the + // original. Therefore, prepare the output array, + // consisting of any previous elements that had mapped to + // themselves (if any), and the element that didn't map to + // itself. 
+ all_identical = false; + output = ArrayNode::CreateRepeated(arr->size(), U()); + output->InitRange(0, arr->begin(), it); + output->SetItem(it - arr->begin(), std::move(mapped)); + it++; + break; + } + } + if (all_identical) { + return data; + } + } else { + // Path for incompatible types. The constexpr check for + // compatible types isn't strictly necessary, as the first + // mapped.same_as(*it) would return false, but we might as well + // avoid it altogether. + output = ArrayNode::CreateRepeated(arr->size(), U()); + } + + // Normal path for incompatible types, or post-copy path for + // copy-on-write instances. + // + // If the types are incompatible, then at this point `output` is + // empty, and `it` points to the first element of the input. + // + // If the types were compatible, then at this point `output` + // contains zero or more elements that mapped to themselves + // followed by the first element that does not map to itself, and + // `it` points to the element just after the first element that + // does not map to itself. Because at least one element has been + // changed, we no longer have the opportunity to avoid a copy, so + // we don't need to check the result. + // + // In both cases, `it` points to the next element to be processed, + // so we can either start or resume the iteration from that point, + // with no further checks on the result. + for (; it != arr->end(); it++) { + U mapped = fmap(DowncastNoCheck(*it)); + output->SetItem(it - arr->begin(), std::move(mapped)); + } + + return output; + } }; /*! diff --git a/src/ir/type_functor.cc b/src/ir/type_functor.cc index 51d5d3778c10..36838b62aabc 100644 --- a/src/ir/type_functor.cc +++ b/src/ir/type_functor.cc @@ -97,14 +97,7 @@ Type TypeMutator::VisitType(const Type& t) { Array TypeMutator::MutateArray(Array arr) { // The array will do copy on write // If no changes are made, the original array will be returned. 
- for (size_t i = 0; i < arr.size(); ++i) { - Type ty = arr[i]; - Type new_ty = VisitType(ty); - if (!ty.same_as(new_ty)) { - arr.Set(i, new_ty); - } - } - return arr; + return arr.Map([this](const Type& ty) { return VisitType(ty); }); } Type TypeMutator::VisitType_(const TypeVarNode* op) { return GetRef(op); } diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 4c1358f42519..fb325684e65b 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -256,7 +256,7 @@ BlockRealize GenerateBlockFromTensors(const te::ComputeOp& compute_op, // TensorIR will not allow Tensor data structure if (value->IsInstance()) { const auto array_value = Downcast>(value); - annotations.Set(key, MutateArray(array_value, mutate_attr)); + annotations.Set(key, array_value.Map(mutate_attr)); } else { annotations.Set(key, mutate_attr(value)); } diff --git a/src/tir/analysis/device_constraint_utils.cc b/src/tir/analysis/device_constraint_utils.cc index 1309681513a9..32b59ce54b69 100644 --- a/src/tir/analysis/device_constraint_utils.cc +++ b/src/tir/analysis/device_constraint_utils.cc @@ -393,9 +393,8 @@ class ApplyDeviceConstraintsMutator : public StmtExprMutator { } template - Array VisitItems(Array items) { - items.MutateByApply([this](const T& item) { return VisitItem(item.get()); }); // copy-on-write - return items; + Array VisitItems(const Array& items) { + return items.Map([this](T item) -> T { return VisitItem(item.get()); }); } Stmt VisitStmt_(const BlockNode* block_node) final { diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index cae4109a6026..0dfda954b818 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -461,8 +461,8 @@ Buffer Buffer::MakeSlice(Array begins, Array extents) const ICHECK(n != nullptr); arith::Analyzer ana; begins = SimplifyArray(&ana, begins); - Array elem_offset = n->ElemOffset(begins); - elem_offset.MutateByApply([&](const PrimExpr& expr) { return ana.Simplify(expr); 
}); + Array elem_offset = + n->ElemOffset(begins).Map([&](const PrimExpr& expr) { return ana.Simplify(expr); }); Array strides = n->strides; if (strides.size() == 0) { diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index 59db4ea410fd..daae7eaf68f5 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -994,8 +994,7 @@ Array CommReducerNode::operator()(Array a, Array b value_map.Set(lhs[i], a[i]); value_map.Set(rhs[i], b[i]); } - auto ret = this->result; - ret.MutateByApply([&value_map](const PrimExpr& e) { return Substitute(e, value_map); }); + auto ret = this->result.Map([&value_map](const PrimExpr& e) { return Substitute(e, value_map); }); return ret; } diff --git a/src/tir/ir/expr_functor.cc b/src/tir/ir/expr_functor.cc index c8dc84695b4f..da02e0316f48 100644 --- a/src/tir/ir/expr_functor.cc +++ b/src/tir/ir/expr_functor.cc @@ -132,7 +132,7 @@ PrimExpr ExprMutator::VisitExpr_(const LoadNode* op) { PrimExpr ExprMutator::VisitExpr_(const BufferLoadNode* op) { auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); }; - Array indices = MutateArray(op->indices, fmutate); + Array indices = op->indices.Map(fmutate); if (indices.same_as(op->indices)) { return GetRef(op); } else { @@ -142,7 +142,7 @@ PrimExpr ExprMutator::VisitExpr_(const BufferLoadNode* op) { PrimExpr ExprMutator::VisitExpr_(const ProducerLoadNode* op) { auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); }; - Array indices = MutateArray(op->indices, fmutate); + Array indices = op->indices.Map(fmutate); if (indices.same_as(op->indices)) { return GetRef(op); } else { @@ -162,7 +162,7 @@ PrimExpr ExprMutator::VisitExpr_(const LetNode* op) { PrimExpr ExprMutator::VisitExpr_(const CallNode* op) { auto fmutate = [this](const PrimExpr& e) { return this->VisitExpr(e); }; - Array args = MutateArray(op->args, fmutate); + Array args = op->args.Map(fmutate); if (args.same_as(op->args)) { return GetRef(op); @@ -218,11 +218,11 @@ PrimExpr ExprMutator::VisitExpr_(const 
ReduceNode* op) { return IterVar(Range::FromMinExtent(min, extent), v->var, v->iter_type, v->thread_tag); } }; - Array axis = MutateArray(op->axis, fitervar); + Array axis = op->axis.Map(fitervar); auto fexpr = [this](const PrimExpr& e) { return this->VisitExpr(e); }; - Array source = MutateArray(op->source, fexpr); - Array init = MutateArray(op->init, fexpr); + Array source = op->source.Map(fexpr); + Array init = op->init.Map(fexpr); PrimExpr condition = this->VisitExpr(op->condition); @@ -285,7 +285,7 @@ PrimExpr ExprMutator::VisitExpr_(const BroadcastNode* op) { PrimExpr ExprMutator::VisitExpr_(const ShuffleNode* op) { auto fexpr = [this](const PrimExpr& e) { return this->VisitExpr(e); }; - auto vectors = MutateArray(op->vectors, fexpr); + auto vectors = op->vectors.Map(fexpr); if (vectors.same_as(op->vectors)) { return GetRef(op); } else { diff --git a/src/tir/ir/functor_common.h b/src/tir/ir/functor_common.h index 8b5a361a37c6..b9bb43ca6ba6 100644 --- a/src/tir/ir/functor_common.h +++ b/src/tir/ir/functor_common.h @@ -38,8 +38,7 @@ inline void VisitArray(const Array& arr, F fvisit) { template inline Array MutateArray(Array arr, F fmutate) { - arr.MutateByApply(fmutate); - return arr; + return arr.Map(fmutate); } } // namespace tir diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 64c5d5d5ddde..2ffc5079246b 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -185,9 +185,8 @@ Array IndexMapNode::MapIndices(const Array& indices, analyzer = &local_analyzer; } - Array output = final_indices; - output.MutateByApply( - [&](const PrimExpr& index) { return analyzer->Simplify(Substitute(index, vmap)); }); + Array output = final_indices.Map( + [&](PrimExpr index) { return analyzer->Simplify(Substitute(std::move(index), vmap)); }); return output; } diff --git a/src/tir/ir/specialize.cc b/src/tir/ir/specialize.cc index 520e3ee03c92..ea68015bc73b 100644 --- a/src/tir/ir/specialize.cc +++ b/src/tir/ir/specialize.cc @@ -115,8 +115,7 @@ 
class PrimFuncSpecializer : public StmtExprMutator { private: Stmt VisitStmt_(const BlockNode* op) final { // Step.0. Define buffer mappings which is allocated inside the block - Array alloc_buffers = MutateArray( - op->alloc_buffers, + Array alloc_buffers = op->alloc_buffers.Map( std::bind(&PrimFuncSpecializer::MutateAllocBuffer, this, std::placeholders::_1)); // Step.1. Recursively visit block body @@ -124,11 +123,9 @@ class PrimFuncSpecializer : public StmtExprMutator { op = stmt.as(); ICHECK(op != nullptr); - Array reads = MutateArray( - op->reads, + Array reads = op->reads.Map( std::bind(&PrimFuncSpecializer::MutateBufferRegion, this, std::placeholders::_1)); - Array writes = MutateArray( - op->writes, + Array writes = op->writes.Map( std::bind(&PrimFuncSpecializer::MutateBufferRegion, this, std::placeholders::_1)); if (alloc_buffers.same_as(op->alloc_buffers) && reads.same_as(op->reads)) { @@ -200,10 +197,9 @@ class PrimFuncSpecializer : public StmtExprMutator { private: Buffer MutateBuffer(const Buffer& buffer) { - Array shape = - MutateArray(buffer->shape, [this](const PrimExpr& e) { return VisitExpr(e); }); + Array shape = buffer->shape.Map([this](const PrimExpr& e) { return VisitExpr(e); }); Array strides = - MutateArray(buffer->strides, [this](const PrimExpr& e) { return VisitExpr(e); }); + buffer->strides.Map([this](const PrimExpr& e) { return VisitExpr(e); }); PrimExpr elem_offset = VisitExpr(buffer->elem_offset); @@ -242,9 +238,8 @@ class PrimFuncSpecializer : public StmtExprMutator { BufferRegion MutateBufferRegion(const BufferRegion& buffer_region) { auto it = buffer_map_.find(buffer_region->buffer); - Array region = - MutateArray(buffer_region->region, - std::bind(&PrimFuncSpecializer::MutateRange, this, std::placeholders::_1)); + Array region = buffer_region->region.Map( + std::bind(&PrimFuncSpecializer::MutateRange, this, std::placeholders::_1)); if (it == buffer_map_.end() && region.same_as(buffer_region->region)) { return buffer_region; } else 
{ diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc index c75eb52f9296..c2e2489cba92 100644 --- a/src/tir/ir/stmt_functor.cc +++ b/src/tir/ir/stmt_functor.cc @@ -183,9 +183,8 @@ class StmtMutator::Internal { return arr; } else { bool allow_cow = false; - Array copy = arr; std::swap(allow_cow, self->allow_copy_on_write_); - copy.MutateByApply(fmutate); + Array copy = arr.Map(fmutate); std::swap(allow_cow, self->allow_copy_on_write_); return copy; } diff --git a/src/tir/schedule/primitive/decompose_padding.cc b/src/tir/schedule/primitive/decompose_padding.cc index 93fb88e66619..c41760876722 100644 --- a/src/tir/schedule/primitive/decompose_padding.cc +++ b/src/tir/schedule/primitive/decompose_padding.cc @@ -212,16 +212,15 @@ static std::pair CreateConstBlock(const BlockRealizeNode* re // create new write region ICHECK_EQ(block->writes.size(), 1U); - BufferRegion write_region = - BufferRegion(block->writes[0]->buffer, - MutateArray(block->writes[0]->region, [rewrite_expr](const Range& r) { - return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent)); - })); + BufferRegion write_region = BufferRegion( + block->writes[0]->buffer, block->writes[0]->region.Map([rewrite_expr](const Range& r) { + return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent)); + })); // create block to fill const pad values BufferStore store = Downcast(block->body); store.CopyOnWrite()->value = info.pad_value; - store.CopyOnWrite()->indices = MutateArray(store->indices, rewrite_expr); + store.CopyOnWrite()->indices = store->indices.Map(rewrite_expr); Block new_block(/*iter_vars=*/new_iter_vars, /*reads=*/{}, /*writes=*/{write_region}, /*name_hint=*/block->name_hint + "_pad_const", /*body=*/std::move(store)); @@ -307,7 +306,7 @@ static std::pair CreateInBoundBlock(const BlockRealizeNode* return analyzer->Simplify(Substitute(e, repl_dict)); }; auto rewrite_region = [rewrite_expr](const Region& region) { - return MutateArray(region, 
[rewrite_expr](const Range& r) { + return region.Map([rewrite_expr](const Range& r) { return Range::FromMinExtent(rewrite_expr(r->min), rewrite_expr(r->extent)); }); }; @@ -324,7 +323,7 @@ static std::pair CreateInBoundBlock(const BlockRealizeNode* // create new block realize node BufferStore store = Downcast(block->body); store.CopyOnWrite()->value = rewrite_expr(info.in_bound_value); - store.CopyOnWrite()->indices = MutateArray(store->indices, rewrite_expr); + store.CopyOnWrite()->indices = store->indices.Map(rewrite_expr); Block new_block(/*iter_vars=*/new_iter_vars, /*reads=*/reads, /*writes=*/writes, /*name_hint=*/block->name_hint, /*body=*/std::move(store)); PrimExpr new_predicate = rewrite_expr(info.in_bound_predicate); diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index b00005c58061..d99cc199fe5f 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -177,12 +177,12 @@ Stmt ReplaceBufferMutator::VisitStmt_(const BlockNode* block) { }; // Step 1. Mutate `match_buffers`. If an old buffer appears as a source of MatchBufferRegion, - Array match_buffers = MutateArray(block->match_buffers, f_mutate_match_buffer); + Array match_buffers = block->match_buffers.Map(f_mutate_match_buffer); // Step 2. Mutate the read/write region. - Array reads = MutateArray(block->reads, f_mutate_read_write_region); - Array writes = MutateArray(block->writes, f_mutate_read_write_region); + Array reads = block->reads.Map(f_mutate_read_write_region); + Array writes = block->writes.Map(f_mutate_read_write_region); // Step 3. Mutate `alloc_buffers` for the old buffer allocated in this block. - Array alloc_buffers = MutateArray(block->alloc_buffers, f_mutate_alloc_buffers); + Array alloc_buffers = block->alloc_buffers.Map(f_mutate_alloc_buffers); // Step 4. Recursively mutate the block. 
Block mutated_block = Downcast(StmtMutator::VisitStmt_(block)); diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc index 455140c75c13..f49b6b2ace8e 100644 --- a/src/tir/transforms/inject_virtual_thread.cc +++ b/src/tir/transforms/inject_virtual_thread.cc @@ -400,8 +400,8 @@ class VTInjector : public arith::IRMutatorWithAnalyzer { PrimExpr condition = this->VisitExpr(op->condition); - Array extents = op->extents; - extents.MutateByApply([this](const PrimExpr& extent) { return this->VisitExpr(extent); }); + Array extents = + op->extents.Map([this](const PrimExpr& extent) { return this->VisitExpr(extent); }); if (visit_touched_var_ && !vt_loop_injected_) { return InjectVTLoop(GetRef(op), true); diff --git a/src/tir/transforms/lower_match_buffer.cc b/src/tir/transforms/lower_match_buffer.cc index 5bde5cb90e2b..9b915da6290b 100644 --- a/src/tir/transforms/lower_match_buffer.cc +++ b/src/tir/transforms/lower_match_buffer.cc @@ -51,10 +51,10 @@ class MatchBufferLower : public StmtExprMutator { Stmt stmt = StmtExprMutator ::VisitStmt_(op); op = stmt.as(); ICHECK(op != nullptr); - Array reads = MutateArray( - op->reads, std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1)); - Array writes = MutateArray( - op->writes, std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1)); + Array reads = + op->reads.Map(std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1)); + Array writes = op->writes.Map( + std::bind(&MatchBufferLower::VisitBufferRegion, this, std::placeholders::_1)); if (reads.same_as(op->reads) && writes.same_as(op->writes) && op->match_buffers.empty()) { return stmt; diff --git a/src/tir/transforms/renew_defs.cc b/src/tir/transforms/renew_defs.cc index c717dc9b98f2..a185916a9a4c 100644 --- a/src/tir/transforms/renew_defs.cc +++ b/src/tir/transforms/renew_defs.cc @@ -96,18 +96,16 @@ class RenewDefMutator : public StmtExprMutator { Stmt 
VisitStmt_(const BlockNode* op) final { // Step 0. Re-define Itervars - Array iter_vars = MutateArray( - op->iter_vars, std::bind(&RenewDefMutator::VisitIterVar, this, std::placeholders::_1)); + Array iter_vars = + op->iter_vars.Map(std::bind(&RenewDefMutator::VisitIterVar, this, std::placeholders::_1)); // Step 1. Re-define buffers allocate under the block - Array alloc_buffers = MutateArray( - op->alloc_buffers, + Array alloc_buffers = op->alloc_buffers.Map( std::bind(&RenewDefMutator::VisitBuffer, this, std::placeholders::_1, /*define=*/true)); // Step 2. Re-define match_buffers - Array match_buffers = - MutateArray(op->match_buffers, - std::bind(&RenewDefMutator::VisitMatchBuffer, this, std::placeholders::_1)); + Array match_buffers = op->match_buffers.Map( + std::bind(&RenewDefMutator::VisitMatchBuffer, this, std::placeholders::_1)); // Step 3. Visit body Stmt stmt = StmtExprMutator::VisitStmt_(op); @@ -115,10 +113,10 @@ class RenewDefMutator : public StmtExprMutator { ICHECK(op); // Step 4. Revisit access region - Array reads = MutateArray( - op->reads, std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1)); - Array writes = MutateArray( - op->writes, std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1)); + Array reads = + op->reads.Map(std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1)); + Array writes = + op->writes.Map(std::bind(&RenewDefMutator::VisitBufferRegion, this, std::placeholders::_1)); // Step 5. Regenerate block. 
Since the defs are changed, we need to create a new block auto n = make_object(*op); @@ -203,9 +201,9 @@ class RenewDefMutator : public StmtExprMutator { // update data Var data = Downcast(redefine_if_is_var(buffer->data)); // update shape - Array shape = MutateArray(buffer->shape, redefine_if_is_var); + Array shape = buffer->shape.Map(redefine_if_is_var); // update strides - Array strides = MutateArray(buffer->strides, redefine_if_is_var); + Array strides = buffer->strides.Map(redefine_if_is_var); // update elem_offset PrimExpr elem_offset = redefine_if_is_var(buffer->elem_offset); @@ -242,10 +240,10 @@ class RenewDefMutator : public StmtExprMutator { return Downcast((*it).second); } Var data = Downcast(VisitExpr(buffer->data)); - Array shape = MutateArray( - buffer->shape, std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1)); - Array strides = MutateArray( - buffer->strides, std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1)); + Array shape = + buffer->shape.Map(std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1)); + Array strides = + buffer->strides.Map(std::bind(&RenewDefMutator::VisitExpr, this, std::placeholders::_1)); PrimExpr elem_offset = VisitExpr(buffer->elem_offset); auto n = make_object(*buffer.get()); @@ -276,9 +274,8 @@ class RenewDefMutator : public StmtExprMutator { BufferRegion VisitBufferRegion(const BufferRegion& buffer_region) { Buffer buffer = VisitBuffer(buffer_region->buffer); - Array region = - MutateArray(buffer_region->region, - std::bind(&RenewDefMutator::VisitRange, this, std::placeholders::_1)); + Array region = buffer_region->region.Map( + std::bind(&RenewDefMutator::VisitRange, this, std::placeholders::_1)); if (buffer.same_as(buffer_region->buffer) && region.same_as(buffer_region->region)) { return buffer_region; } else { diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 5c5a47e86a9a..3cc17847e69b 100644 --- 
a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -379,8 +379,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor(op); auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); }; - Array indices = op->indices; - indices.MutateByApply(fmutate); + Array indices = op->indices.Map(fmutate); if (!indices.same_as(op->indices)) { auto writer = load.CopyOnWrite(); @@ -428,8 +427,7 @@ class Vectorizer : public StmtMutator, public ExprFunctor(op); auto fmutate = [this](const PrimExpr& index) { return this->VisitExpr(index); }; - Array indices = op->indices; - indices.MutateByApply(fmutate); + Array indices = op->indices.Map(fmutate); PrimExpr value = this->VisitExpr(op->value); diff --git a/tests/cpp/container_test.cc b/tests/cpp/container_test.cc index f6c4fb4b67d6..d75a510d0c95 100644 --- a/tests/cpp/container_test.cc +++ b/tests/cpp/container_test.cc @@ -173,6 +173,141 @@ TEST(Array, Mutate) { ICHECK(list2[1].same_as(z)); } +TEST(Array, MutateInPlaceForUniqueReference) { + using namespace tvm; + Var x("x"); + Array arr{x, x}; + ICHECK(arr.unique()); + auto* before = arr.get(); + + arr.MutateByApply([](Var) { return Var("y"); }); + auto* after = arr.get(); + ICHECK_EQ(before, after); +} + +TEST(Array, CopyWhenMutatingNonUniqueReference) { + using namespace tvm; + Var x("x"); + Array arr{x, x}; + Array arr2 = arr; + + ICHECK(!arr.unique()); + auto* before = arr.get(); + + arr.MutateByApply([](Var) { return Var("y"); }); + auto* after = arr.get(); + ICHECK_NE(before, after); +} + +TEST(Array, Map) { + // Basic functionality + using namespace tvm; + Var x("x"); + Var y("y"); + Array var_arr{x, y}; + Array expr_arr = var_arr.Map([](Var var) -> PrimExpr { return var + 1; }); + + ICHECK_NE(var_arr.get(), expr_arr.get()); + ICHECK(expr_arr[0]->IsInstance()); + ICHECK(expr_arr[1]->IsInstance()); + ICHECK(expr_arr[0].as()->a.same_as(x)); + ICHECK(expr_arr[1].as()->a.same_as(y)); +} + +TEST(Array, 
MapToSameTypeWithoutCopy) { + // If the applied map doesn't alter the contents, we can avoid a + // copy. + using namespace tvm; + Var x("x"); + Var y("y"); + Array var_arr{x, y}; + Array var_arr2 = var_arr.Map([](Var var) { return var; }); + + ICHECK_EQ(var_arr.get(), var_arr2.get()); +} + +TEST(Array, MapToSameTypeWithCopy) { + // If the applied map does alter the contents, we need to make a + // copy. The loop in this test is to validate correct behavior + // regardless of where the first discrepancy occurs. + using namespace tvm; + Var x("x"); + Var y("y"); + Var z("z"); + Var replacement("replacement"); + for (size_t i = 0; i < 2; i++) { + Array var_arr{x, y, z}; + Var to_replace = var_arr[i]; + Array var_arr2 = + var_arr.Map([&](Var var) { return var.same_as(to_replace) ? replacement : var; }); + + ICHECK_NE(var_arr.get(), var_arr2.get()); + + // The original array is unchanged + ICHECK_EQ(var_arr.size(), 3); + ICHECK(var_arr[0].same_as(x)); + ICHECK(var_arr[1].same_as(y)); + + // The returned array has one of the elements replaced. + ICHECK_EQ(var_arr2.size(), 3); + ICHECK(var_arr2[i].same_as(replacement)); + ICHECK(i == 0 || var_arr2[0].same_as(x)); + ICHECK(i == 1 || var_arr2[1].same_as(y)); + ICHECK(i == 2 || var_arr2[2].same_as(z)); + } +} + +TEST(Array, MapToSuperclassWithoutCopy) { + // If a map is converting to a superclass, and the mapping function + // doesn't change the value other than a cast, we can avoid a + // copy. + using namespace tvm; + Var x("x"); + Var y("y"); + Array var_arr{x, y}; + Array expr_arr = var_arr.Map([](Var var) { return PrimExpr(var); }); + + ICHECK_EQ(var_arr.get(), expr_arr.get()); +} + +TEST(Array, MapToSubclassWithoutCopy) { + // If a map is converting to a subclass, and the mapped array + // happens to only contain instances of that subclass, we may be + // able to avoid a copy. 
+ using namespace tvm; + Var x("x"); + Var y("y"); + Array expr_arr{x, y}; + Array var_arr = expr_arr.Map([](PrimExpr expr) -> Var { return Downcast(expr); }); + + ICHECK_EQ(var_arr.get(), expr_arr.get()); +} + +TEST(Array, MapToOptionalWithoutCopy) { + // Optional and T both have the same T::ContainerType, just with + // different interfaces for handling `T::data_ == nullptr`. + using namespace tvm; + Var x("x"); + Var y("y"); + Array var_arr{x, y}; + Array> opt_arr = var_arr.Map([](Var var) { return Optional(var); }); + + ICHECK_EQ(var_arr.get(), opt_arr.get()); +} + +TEST(Array, MapFromOptionalWithoutCopy) { + // Optional and T both have the same T::ContainerType, just with + // different interfaces for handling `T::data_ == nullptr`. + using namespace tvm; + Var x("x"); + Var y("y"); + Array> opt_arr{x, y}; + Array var_arr = + opt_arr.Map([](Optional var) { return var.value_or(Var("undefined")); }); + + ICHECK_EQ(var_arr.get(), opt_arr.get()); +} + TEST(Array, Iterator) { using namespace tvm; Array array{1, 2, 3}; From 52dbf102cdba1186e517977ee02aaa7bbe46d0df Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 20 Sep 2022 14:55:16 -0700 Subject: [PATCH 217/704] Fix caffe, boost install in Python venvs by creating python3.X link (#12828) * Fix caffe, boost install in Python venvs by creating python3.X link. * Use getsitepackages() --- docker/install/ubuntu_install_boost.sh | 3 +++ docker/install/ubuntu_install_caffe.sh | 4 ++-- docker/install/ubuntu_install_python.sh | 7 +++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docker/install/ubuntu_install_boost.sh b/docker/install/ubuntu_install_boost.sh index e226bbc5d96e..1c04c8a8ff20 100755 --- a/docker/install/ubuntu_install_boost.sh +++ b/docker/install/ubuntu_install_boost.sh @@ -24,6 +24,9 @@ cleanup() { trap cleanup 0 +# NOTE: by default, tvm-venv python is used. Install boost on the system. 
+PATH=${PATH/${TVM_VENV}\/bin:/} + curl -LO https://boostorg.jfrog.io/artifactory/main/release/1.67.0/source/boost_1_67_0.tar.gz BOOST_HASH=8c247e040303a97895cee9c9407ef205e2c3ab09f0b8320997835ad6221dff23a87231629498ccfd0acca473f74e9ec27b8bd774707b062228df1e5f72d44c92 echo "$BOOST_HASH" boost_1_67_0.tar.gz | sha512sum -c diff --git a/docker/install/ubuntu_install_caffe.sh b/docker/install/ubuntu_install_caffe.sh index 4d9763b69aa3..1e42270e267a 100755 --- a/docker/install/ubuntu_install_caffe.sh +++ b/docker/install/ubuntu_install_caffe.sh @@ -65,5 +65,5 @@ cd / && rm -rf /caffe_src PYCAFFE_ROOT=${CAFFE_HOME}/python echo "${CAFFE_HOME}/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig -VENV_SITE_PACKAGE=$(pip3 show numpy | grep "Location:" | cut -d ' ' -f 2) -ln -s ${PYCAFFE_ROOT}/caffe ${VENV_SITE_PACKAGE}/caffe +site_packages=$("${TVM_VENV}/bin/python3" -c 'import site; print(site.getsitepackages()[0])') +ln -s ${PYCAFFE_ROOT}/caffe "${site_packages}/caffe" diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh index 66a80e1fdc52..fb31c41dccea 100755 --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -65,6 +65,13 @@ mkdir -p "${venv_dir}" python3 -mvenv "${TVM_VENV}" . "${TVM_VENV}/bin/activate" +# NOTE: Only in python3.9 does venv guarantee it creates the python3.X binary. +# This is needed so that cmake's find_package(PythonInterp) works inside the venv. +# See https://bugs.python.org/issue39656 +if [ ! -e "${TVM_VENV}/bin/python${PYTHON_VERSION}" ]; then + ln -s "${TVM_VENV}/bin/python" "${TVM_VENV}/bin/python${PYTHON_VERSION}" +fi + # Update pip to match version used to produce requirements-hashed.txt. This step # is necessary so that pip's dependency solver is recent. 
pip_spec=$(cat /install/python/bootstrap/lockfiles/constraints-${PYTHON_VERSION}.txt | grep 'pip==') From fa5045bf6923c94758e15a7fad7c0904440a4698 Mon Sep 17 00:00:00 2001 From: masahi Date: Wed, 21 Sep 2022 09:32:12 +0900 Subject: [PATCH 218/704] [Metaschedule] MultiLevelTiling for wide vector architectures (#12845) * [Metaschedule] Introduce MultiLevelTiling for wide vector architecture * update test * format * cpplint --- include/tvm/meta_schedule/schedule_rule.h | 15 +++ .../meta_schedule/schedule_rule/__init__.py | 1 + .../schedule_rule/multi_level_tiling.py | 37 ++++++ .../schedule_rule/multi_level_tiling.cc | 35 +++-- .../schedule_rule/multi_level_tiling.h | 3 + .../multi_level_tiling_wide_vector.cc | 120 ++++++++++++++++++ .../test_meta_schedule_schedule_rule_mlt.py | 108 +++++++++++++++- 7 files changed, 307 insertions(+), 12 deletions(-) create mode 100644 src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 55704cf4a97d..2c9da1df9dae 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -187,6 +187,21 @@ class ScheduleRule : public runtime::ObjectRef { Optional> vector_load_lens, Optional> reuse_read, Optional> reuse_write, bool use_software_pipeline); + /*! + * \brief Extension of MultiLevelTiling for backends with wide vectors. + * The loop over the innermost spatial axis of the output buffer is always vectorized with the + * maximum vector length. + * \param structure The tiling structure. 'SSRSRS' is recommended. + * \param vector_length_in_bits The length of a vector register in bits. + * \param max_innermost_factor The maximum size of the innermost factor. NullOpt means no limit + * \param reuse_read Data reuse configuration for reading. NullOpt means no reuse. + * \param reuse_write Data reuse configuration for writing. NullOpt means no reuse. 
+ * \return The schedule rule created + */ + TVM_DLL static ScheduleRule MultiLevelTilingWideVector( + String structure, Integer vector_length_in_bits, Optional max_innermost_factor, + Optional> reuse_read, Optional> reuse_write); + /*! * \brief Create a rule: add-rfactor to some blocks if needed * \param max_jobs_per_core The maximum number of jobs to be launched per CPU core. It sets the diff --git a/python/tvm/meta_schedule/schedule_rule/__init__.py b/python/tvm/meta_schedule/schedule_rule/__init__.py index dd0119b0a7f8..a015d0eb1ab2 100644 --- a/python/tvm/meta_schedule/schedule_rule/__init__.py +++ b/python/tvm/meta_schedule/schedule_rule/__init__.py @@ -28,6 +28,7 @@ MultiLevelTilingWithIntrin, ReuseType, MultiLevelTilingTensorCore, + MultiLevelTilingWideVector, ) from .parallel_vectorize_unroll import ParallelizeVectorizeUnroll from .random_compute_location import RandomComputeLocation diff --git a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py index 6703bc5716e9..e91382dd017a 100644 --- a/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py +++ b/python/tvm/meta_schedule/schedule_rule/multi_level_tiling.py @@ -187,3 +187,40 @@ def __init__( reuse_write.as_dict() if reuse_write is not None else None, use_software_pipeline, ) + + +@register_object("meta_schedule.MultiLevelTilingWideVector") +class MultiLevelTilingWideVector(ScheduleRule): + """Extension of MultiLevelTiling for backends with wide vectors. The loop over the innermost + spatial axis of the output buffer is always vectorized with the maximum vector length. + + Parameters + ---------- + structure : str + The tiling structure. 'SSRSRS' is recommended. + vector_length_in_bits: int + The length of a vector register in bits. + max_innermost_factor : Optional[int] + The maximum size of the innermost factor. None means no limit + reuse_read : Optional[ReuseType] + Data reuse configuration for reading. 
None means no reuse. + reuse_write : Optional[ReuseType] + Data reuse configuration for writing. None means no reuse. + """ + + def __init__( + self, + structure: str, + vector_length_in_bits: int, + max_innermost_factor: Optional[int] = None, + reuse_read: Optional[ReuseType] = None, + reuse_write: Optional[ReuseType] = None, + ) -> None: + self.__init_handle_by_constructor__( + _ffi_api.ScheduleRuleMultiLevelTilingWideVector, # type: ignore # pylint: disable=no-member + structure, + vector_length_in_bits, + max_innermost_factor, + reuse_read.as_dict() if reuse_read is not None else None, + reuse_write.as_dict() if reuse_write is not None else None, + ) diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.cc b/src/meta_schedule/schedule_rule/multi_level_tiling.cc index 1625a27b9aaf..2ae6714f55d8 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.cc @@ -166,6 +166,17 @@ std::vector MultiLevelTilingNode::AddWriteReuse(State state) const { return results; } +Array MultiLevelTilingNode::SplitLoop(const Schedule& sch, BlockRV block, LoopRV loop, + int n_tiles) const { + Array factors = sch->SamplePerfectTile( + /*loop=*/loop, + /*n=*/n_tiles, + /*max_innermost_factor=*/max_innermost_factor); + Array splits = sch->Split(/*loop=*/loop, + /*factors=*/{factors.begin(), factors.end()}); + return splits; +} + std::vector MultiLevelTilingNode::TileLoopNest(State state) const { Schedule& sch = state->sch; const BlockRV& block_rv = state->block_rv; @@ -179,6 +190,7 @@ std::vector MultiLevelTilingNode::TileLoopNest(State state) const { for (int i = 0, n = loops.size(); i < n; ++i) { LoopRV loop = loops[i]; const std::vector* idx = nullptr; + if (iter_types[i] == IterVarType::kDataPar) { idx = &s_indices_; if (spatial_loop_product != -1) { @@ -193,17 +205,18 @@ std::vector MultiLevelTilingNode::TileLoopNest(State state) const { } else { continue; } - // Do the split - int n_tiles = idx->size(); - 
Array factors = sch->SamplePerfectTile( - /*loop=*/loop, - /*n=*/n_tiles, - /*max_innermost_factor=*/max_innermost_factor); - Array splits = sch->Split(/*loop=*/loop, - /*factors=*/{factors.begin(), factors.end()}); - // Put every tile to its slot - for (int j = 0; j < n_tiles; ++j) { - tiles[idx->at(j)].push_back(splits[j]); + + const int n_tiles = idx->size(); + + if (n_tiles == 1) { + tiles[idx->at(0)].push_back(loop); + } else { + auto splits = SplitLoop(sch, block_rv, loop, n_tiles); + + // Put every tile to its slot + for (int j = 0; j < n_tiles; ++j) { + tiles[idx->at(j)].push_back(splits[j]); + } } } // Step 3. Reorder to organize the tiles diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling.h b/src/meta_schedule/schedule_rule/multi_level_tiling.h index 47da878c3be0..8f55e8e7e4e4 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling.h +++ b/src/meta_schedule/schedule_rule/multi_level_tiling.h @@ -161,6 +161,9 @@ class MultiLevelTilingNode : public ScheduleRuleNode { protected: virtual std::vector ApplySubRules(std::vector states); + virtual Array SplitLoop(const tir::Schedule& sch, tir::BlockRV block, + tir::LoopRV loop, int n_tiles) const; + // Annotate a block to use cooperative fetching void AnnotateCooperativeFetching(tir::Schedule* sch, const tir::BlockRV& block) const; diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc new file mode 100644 index 000000000000..f5ec009a9b28 --- /dev/null +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_wide_vector.cc @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "../../tir/schedule/analysis.h" +#include "../../tir/schedule/transform.h" +#include "../utils.h" +#include "multi_level_tiling.h" + +namespace tvm { +namespace meta_schedule { + +using tir::BlockRV; +using tir::LoopRV; +using tir::Schedule; + +/*! + * \brief Extension of MultiLevelTiling for backends with wide vectors. + * The loop over the innermost spatial axis of the output buffer is always vectorized with the + * maximum vector length. 
+ */ +class MultiLevelTilingWideVectorNode : public MultiLevelTilingNode { + public: + size_t vector_length_in_bits; + + static constexpr const char* _type_key = "meta_schedule.MultiLevelTilingWideVector"; + TVM_DECLARE_FINAL_OBJECT_INFO(MultiLevelTilingWideVectorNode, MultiLevelTilingNode); + + protected: + Array SplitLoop(const Schedule& sch, BlockRV block, LoopRV loop, int n_tiles) const; +}; + +Array MultiLevelTilingWideVectorNode::SplitLoop(const Schedule& sch, BlockRV block_rv, + LoopRV loop_rv, int n_tiles) const { + const tir::ForNode* loop = TVM_SREF_TO_FOR(sch->GetSRef(loop_rv)); + const tir::StmtSRef block_sref = sch->GetSRef(block_rv); + const tir::BlockNode* block_node = block_sref->StmtAs(); + const tir::BlockRealize block_realize = tir::GetBlockRealize(sch->state(), block_sref); + ICHECK(block_node && block_node->writes.size() == 1); + + const auto out_dtype = block_node->writes[0]->buffer->dtype; + const int vec_len = vector_length_in_bits / out_dtype.bits(); + + // Determine if this loop is over the innermost axis of the output buffer. + // In the example below, we look for a loop whose loop var is bound to the axis co. + + // for (i0, 0, 1) { + // for (i1, 0, 56) { + // for (i2, 0, 56) { + // for (i3, 0, 64) { + // for (i4, 0, 3) { + // for (i5, 0, 3) { + // for (i6, 0, 64) { + // block conv2d_nhwc(...) { + // ... + // bind(co, i3) + // ... + // writes([conv2d_nhwc[n, h, w, co]]) + // ... + // conv2d_nhwc[n, h, w, co] = ... + // } + const size_t innermost_axis = block_node->writes[0]->region.size() - 1; + const PrimExpr innermost_iter_value = block_realize->iter_values[innermost_axis]; + + if (!arith::Analyzer().CanProve(loop->loop_var == innermost_iter_value)) { + // If this is not the innermost spatial loop, split the loop in the normal way. + return MultiLevelTilingNode::SplitLoop(sch, block_rv, loop_rv, n_tiles); + } else { + // We split the innermost spatial loop in a way that always uses the maximum vector length. 
+ const int64_t* extent_int = tir::GetLoopIntExtent(loop); + if (extent_int && *extent_int > vec_len) { + Array inner_splits = sch->Split(/*loop=*/loop_rv, + /*factors=*/{NullOpt, PrimExpr(vec_len)}); + Array outer_factors = sch->SamplePerfectTile( + /*loop=*/inner_splits[0], + /*n=*/n_tiles - 1, + /*max_innermost_factor=*/max_innermost_factor); + Array outer_splits = sch->Split( + /*loop=*/inner_splits[0], /*factors=*/{outer_factors.begin(), outer_factors.end()}); + outer_splits.push_back(inner_splits[1]); + return outer_splits; + } else { + Array factors(n_tiles - 1, PrimExpr(1)); + factors.push_back(loop->extent); + return sch->Split(/*loop=*/loop_rv, + /*factors=*/{factors.begin(), factors.end()}); + } + } +} + +ScheduleRule ScheduleRule::MultiLevelTilingWideVector( + String structure, Integer vector_length_in_bits, Optional max_innermost_factor, + Optional> reuse_read, Optional> reuse_write) { + auto node = MultiLevelTilingInitCommon( + structure, NullOpt, max_innermost_factor, NullOpt, reuse_read, reuse_write); + node->vector_length_in_bits = vector_length_in_bits->value; + return ScheduleRule(node); +} + +TVM_REGISTER_NODE_TYPE(MultiLevelTilingWideVectorNode); +TVM_REGISTER_GLOBAL("meta_schedule.ScheduleRuleMultiLevelTilingWideVector") + .set_body_typed(ScheduleRule::MultiLevelTilingWideVector); + +} // namespace meta_schedule +} // namespace tvm diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py index 939ccbe54fa6..d9d078106333 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt.py @@ -16,7 +16,7 @@ # under the License. 
# pylint: disable=missing-module-docstring,missing-function-docstring,missing-class-docstring from tvm import meta_schedule as ms -from tvm import te +from tvm import te, target from tvm.meta_schedule.testing import te_workload from tvm.meta_schedule.testing.schedule_rule import get_rules from tvm.meta_schedule.testing.space_generation import check_sketches @@ -521,9 +521,115 @@ def sum_with_trivial_block_iter( assert not sch.trace.simplified(remove_postproc=True).insts +def test_multi_level_tiling_hexagon(): + @T.prim_func + def cpu_conv2d_nhwc( + inputs: T.Buffer[(1, 56, 56, 64), "float16"], + weight: T.Buffer[(3, 3, 64, 64), "float16"], + conv2d_nhwc: T.Buffer[(1, 56, 56, 64), "float16"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + PadInput = T.alloc_buffer([1, 58, 58, 64], dtype="float16") + for i0, i1, i2, i3 in T.grid(1, 58, 58, 64): + with T.block("PadInput"): + i0_1, i1_1, i2_1, i3_1 = T.axis.remap("SSSS", [i0, i1, i2, i3]) + T.reads(inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1]) + T.writes(PadInput[i0_1, i1_1, i2_1, i3_1]) + PadInput[i0_1, i1_1, i2_1, i3_1] = T.if_then_else( + 1 <= i1_1 and i1_1 < 57 and 1 <= i2_1 and i2_1 < 57, + inputs[i0_1, i1_1 - 1, i2_1 - 1, i3_1], + T.float16(0), + dtype="float16", + ) + for ( + i0_0, + i1_0, + i2_0, + i3_0, + i4_0, + i5_0, + i6_0, + i0_1_1, + i1_1_1, + i2_1_1, + i3_1_1, + i4_1, + i5_1, + i6_1, + i0_2, + i1_2, + i2_2, + i3_2, + ) in T.grid(1, 1, 2, 1, 3, 3, 16, 1, 14, 2, 1, 1, 1, 4, 1, 4, 14, 64): + with T.block("conv2d_nhwc"): + n = T.axis.spatial(1, i0_1_1 + i0_2 + i0_0) + h = T.axis.spatial(56, i1_0 * 56 + i1_1_1 * 4 + i1_2) + w = T.axis.spatial(56, i2_0 * 28 + i2_1_1 * 14 + i2_2) + co = T.axis.spatial(64, i3_0 * 64 + i3_1_1 * 64 + i3_2) + rh = T.axis.reduce(3, i4_1 + i4_0) + rw = T.axis.reduce(3, i5_0 + i5_1) + rc = T.axis.reduce(64, i6_0 * 4 + i6_1) + T.reads(PadInput[n, h + rh, w + rw, co // 64 * 64 + rc], weight[rh, rw, rc, co]) 
+ T.writes(conv2d_nhwc[n, h, w, co]) + T.block_attr({"meta_schedule.tiling_structure": "SRSRS"}) + with T.init(): + conv2d_nhwc[n, h, w, co] = T.float16(0) + conv2d_nhwc[n, h, w, co] = ( + conv2d_nhwc[n, h, w, co] + + PadInput[n, h + rh, w + rw, co // 64 * 64 + rc] * weight[rh, rw, rc, co] + ) + + target_hexagon = target.hexagon("v69", num_cores=4) + + I = 64 + O = 64 + H = 56 + W = 56 + + mod = te.create_prim_func( + te_workload.conv2d_nhwc(1, H, W, I, O, 3, 1, 1, 1, in_dtype="float16", out_dtype="float16") + ) + + actual = ms.TuneContext( + mod=mod, + target=Target(target_hexagon, host=target_hexagon), + space_generator=ms.space_generator.PostOrderApply(), + sch_rules=[ + ms.schedule_rule.MultiLevelTilingWideVector( + structure="SRSRS", + vector_length_in_bits=1024, + max_innermost_factor=64, + reuse_read=None, + reuse_write=None, + ) + ], + task_name="test", + ).generate_design_space() + + decision_0 = [ + ("SamplePerfectTile", [1, 1, 1]), + ("SamplePerfectTile", [1, 14, 4]), + ("SamplePerfectTile", [2, 2, 14]), + ("SamplePerfectTile", [3, 1]), + ("SamplePerfectTile", [3, 1]), + ("SamplePerfectTile", [16, 4]), + ] + + check_sketches( + mod, + sketches=actual, + expected_mods=[cpu_conv2d_nhwc], + expected_decisions=[decision_0], + ) + + if __name__ == "__main__": test_cpu_matmul() test_cpu_matmul_relu() test_cuda_matmul() test_cuda_matmul_relu() test_cuda_sum_with_trivial_block_iter() + test_multi_level_tiling_hexagon() From d4e3207cca1bae532e6e616eca2e80191e45b437 Mon Sep 17 00:00:00 2001 From: "yin.changsheng" Date: Wed, 21 Sep 2022 11:03:14 +0800 Subject: [PATCH 219/704] [TIR] Enhance RemoveNoOp pass to remove negative loop (#12836) --- src/tir/transforms/remove_no_op.cc | 11 +++++++++++ .../unittest/test_tir_transform_remove_no_op.py | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/tir/transforms/remove_no_op.cc b/src/tir/transforms/remove_no_op.cc index ce0d9b87c433..8728817aad57 100644 --- 
a/src/tir/transforms/remove_no_op.cc +++ b/src/tir/transforms/remove_no_op.cc @@ -31,6 +31,7 @@ #include +#include "../../arith/const_fold.h" #include "ir_utils.h" namespace tvm { @@ -87,7 +88,14 @@ class NoOpRemover : public StmtMutator { } } Stmt VisitStmt_(const ForNode* op) final { + var_range_map_[op->loop_var.get()] = arith::IntSet::FromMinExtent(op->min, op->extent); + auto extent_range = arith::EvalSet(op->extent, var_range_map_); + if (!arith::is_neg_inf(extent_range.max()) && !arith::is_pos_inf(extent_range.max()) && + analyzer_.CanProve(extent_range.max() <= 0)) { + return Evaluate(0); + } Stmt stmt = StmtMutator::VisitStmt_(op); + var_range_map_.erase(op->loop_var.get()); op = stmt.as(); if (is_zero(op->extent)) { return Evaluate(0); @@ -162,6 +170,9 @@ class NoOpRemover : public StmtMutator { } return stmt.defined() ? stmt : Evaluate(0); } + + std::unordered_map var_range_map_; + arith::Analyzer analyzer_; }; Stmt RemoveNoOp(Stmt stmt) { return NoOpRemover()(std::move(stmt)); } diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py index e80d46193507..820e32eb7e72 100644 --- a/tests/python/unittest/test_tir_transform_remove_no_op.py +++ b/tests/python/unittest/test_tir_transform_remove_no_op.py @@ -16,6 +16,8 @@ # under the License. 
import tvm from tvm import te +from tvm.script import tir as T +import tvm.testing def nop(): @@ -68,5 +70,17 @@ def test_remove_no_op(): assert isinstance(ret, tvm.tir.Evaluate) +def test_remove_no_op_with_invalid_extent(): + @T.prim_func + def main(A: T.Buffer[(16), "int32"], B: T.Buffer[(16), "int32"]) -> None: + for i in T.serial(16): + for j in T.serial(i - 20): + B[i] = A[i] + j + + mod = tvm.ir.module.IRModule.from_expr(main) + ret = tvm.tir.transform.RemoveNoOp()(mod)["main"].body + assert isinstance(ret, tvm.tir.Evaluate) + + if __name__ == "__main__": - test_remove_no_op() + tvm.testing.main() From b051cad9f40671675d7101ac510b6f733cff0bc2 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Tue, 20 Sep 2022 22:51:05 -0700 Subject: [PATCH 220/704] [FIX,PROFILING] Fix gpu timer name and lookup (#12849) * [FIX,PROFILING] Fix gpu timer name and lookup In the switch from gpu to cuda naming, the cuda timer was passed over. Renaming it to "profiling.timer.cuda" so it is correctly picked up by the timing mechanisms. * warn if timer impl does not exist --- src/runtime/cuda/cuda_device_api.cc | 18 ++++++++++-------- src/runtime/profiling.cc | 13 +++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b4d7b41b7f4a..71788e52999a 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -252,9 +252,11 @@ TVM_REGISTER_GLOBAL("device_api.cuda_host").set_body([](TVMArgs args, TVMRetValu *rv = static_cast(ptr); }); -class GPUTimerNode : public TimerNode { +class CUDATimerNode : public TimerNode { public: virtual void Start() { + // This initial cudaEventRecord is sometimes pretty slow (~100us). Does + // cudaEventRecord do some stream synchronization? 
CUDA_CALL(cudaEventRecord(start_, CUDAThreadEntry::ThreadLocal()->stream)); } virtual void Stop() { CUDA_CALL(cudaEventRecord(stop_, CUDAThreadEntry::ThreadLocal()->stream)); } @@ -264,27 +266,27 @@ class GPUTimerNode : public TimerNode { CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_)); return milliseconds * 1e6; } - virtual ~GPUTimerNode() { + virtual ~CUDATimerNode() { CUDA_CALL(cudaEventDestroy(start_)); CUDA_CALL(cudaEventDestroy(stop_)); } - GPUTimerNode() { + CUDATimerNode() { CUDA_CALL(cudaEventCreate(&start_)); CUDA_CALL(cudaEventCreate(&stop_)); } - static constexpr const char* _type_key = "GPUTimerNode"; - TVM_DECLARE_FINAL_OBJECT_INFO(GPUTimerNode, TimerNode); + static constexpr const char* _type_key = "CUDATimerNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(CUDATimerNode, TimerNode); private: cudaEvent_t start_; cudaEvent_t stop_; }; -TVM_REGISTER_OBJECT_TYPE(GPUTimerNode); +TVM_REGISTER_OBJECT_TYPE(CUDATimerNode); -TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](Device dev) { - return Timer(make_object()); +TVM_REGISTER_GLOBAL("profiling.timer.cuda").set_body_typed([](Device dev) { + return Timer(make_object()); }); TVM_DLL String GetCudaFreeMemory() { diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 2c92633c34fc..168441d1708d 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -89,9 +89,22 @@ TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) { return Timer(make_object()); }); +// keep track of which timers are not defined but we have already warned about +std::set seen_devices; +std::mutex seen_devices_lock; + Timer Timer::Start(Device dev) { auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(dev.device_type)); if (f == nullptr) { + { + std::lock_guard lock(seen_devices_lock); + if (seen_devices.find(dev.device_type) == seen_devices.end()) { + LOG(WARNING) + << "No timer implementation for " << DeviceName(dev.device_type) + << ", using default timer 
instead. It may be inaccurate or have extra overhead."; + seen_devices.insert(dev.device_type); + } + } Timer t = DefaultTimer(dev); t->Start(); return t; From fdc6894b7dae096d0ec983292aa0a2a475843f56 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 21 Sep 2022 08:04:53 -0500 Subject: [PATCH 221/704] [TVMScript][Fix] Correct round-trip of explicit root block (#12673) * [TVMScript][Fix] Correct round-trip of explicit root block Prior to this commit, when converting TIR to TVMScript, the root `tir::Block` is typically hidden. When parsing, however, `tvm::tir::ScriptComplete` will wrap the function body in a root block if the primfunc if the contains at least one block and does not already have a root block. As a result, if the root block is the only block present, it would be stripped by a round-trip. This commit tightens the condition for hiding the root `tir::Block` when converting to TVMScript, so that it is printed in cases where the autocompleter would reinsert it when parsing. --- include/tvm/tir/stmt_functor.h | 32 ++++++++++++ src/printer/tvmscript_printer.cc | 50 ++++++++++++++++--- src/tir/ir/script/script_complete.cc | 37 ++++++++++---- .../unittest/test_tvmscript_roundtrip.py | 21 ++++++++ 4 files changed, 123 insertions(+), 17 deletions(-) diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h index 49b1f28e5d83..2fc3b9678b40 100644 --- a/include/tvm/tir/stmt_functor.h +++ b/include/tvm/tir/stmt_functor.h @@ -427,6 +427,38 @@ TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr, * \return The renewed func. */ TVM_DLL PrimFunc RenewDefs(const PrimFunc& func); + +/*! + * \brief Check if the statement contains the specified node type. + * + * This utility potentially walks the entire statement, and should + * therefore not be used if it could otherwise be merged with another + * pass. 
+ * + * \param stmt The statement to be searched + * \return Whether stmt contains Node + */ +template >> +bool ContainsNode(const Stmt& stmt) { + struct Visitor : StmtVisitor { + // Early bail-out, if we already found the node. + void VisitStmt(const Stmt& stmt) { + if (contains_node) { + return; + } + StmtVisitor::VisitStmt(stmt); + } + + void VisitStmt_(const Node* block) override { contains_node = true; } + + bool contains_node{false}; + }; + + Visitor visitor; + visitor(stmt); + return visitor.contains_node; +} + } // namespace tir } // namespace tvm diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 20720373589f..936ac7580f28 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -1664,19 +1664,53 @@ Doc TVMScriptPrinter::PrintPrimFunc(const PrimFunc& primFunc) { } // print body body << "# body" << Doc::NewLine(); - if (op->body->IsInstance() && - op->body.as()->iter_values.empty()) { - const BlockNode* block = op->body.as()->block.get(); - if (block->annotations.empty() && !ContainsOptionalInfo(GetRef(block))) { - // Skip print root block - body << "# with " << tir_prefix_ << ".block(\"root\")" << Doc::NewLine(); - body << PrintBlockBody(block); + + Optional elided_root_block_body = [&]() -> Optional { + auto block_realize = op->body.as(); + if (!block_realize || block_realize->iter_values.size()) { + return NullOpt; + } + + const auto& block = block_realize->block; + if (block->annotations.size() || ContainsOptionalInfo(block)) { + return NullOpt; + } + + // The autocomplete might recognize the body itself as being a + // root block, and fail to insert it. 
+ bool autocomplete_would_insert_root_block = [&]() -> bool { + if (block->alloc_buffers.size()) { + return true; + } + + auto* block_realize = block->body.as(); + if (block_realize && block_realize->block->iter_vars.size()) { + return true; + } + if (!block_realize && ContainsNode(block->body)) { + return true; + } + return false; + }(); + + if (autocomplete_would_insert_root_block) { + return block; } else { - body << PrintBody(op->body); + return NullOpt; } + }(); + + if (elided_root_block_body) { + // Skip printing of root block in cases where tvm::tir::ScriptComplete + // would re-insert it. + body << "# with " << tir_prefix_ << ".block(\"root\")" << Doc::NewLine(); + body << PrintBlockBody(elided_root_block_body.value().get()); } else { + // If this is a non-root block, or is an unskippable root block, + // just print it without skipping. body << PrintBody(op->body); } + // print func attrs Doc header_attr; if (primFunc->attrs.defined()) { diff --git a/src/tir/ir/script/script_complete.cc b/src/tir/ir/script/script_complete.cc index b11ca6650a14..c44083108d45 100644 --- a/src/tir/ir/script/script_complete.cc +++ b/src/tir/ir/script/script_complete.cc @@ -105,16 +105,35 @@ PrimFunc ScriptComplete(PrimFunc func, const Array& root_allocates) { for (const auto& alloc : root_allocates) { buffer_var_map.Set(alloc->data, alloc); } - bool contain_root = root_allocates.empty() && func->body->IsInstance() && - Downcast(func->body)->block->iter_vars.empty(); - ScriptCompleter script_completer(&buffer_var_map); - // generate surrounding loops automatically - Stmt res = script_completer(func->body); - // generate root block automatically - if ((script_completer.contains_block || root_allocates.size()) && !contain_root) { - res = Block({}, {}, {}, "root", res, NullOpt, root_allocates); - res = BlockRealize({}, Bool(true), Downcast(res)); + + Stmt res = func->body; + + // Generate root block automatically. 
This is done before + // ScriptCompleter, in order to fill the root block's T.reads() and + // T.writes() annotations, as if it had been explicitly written. + bool should_insert_root = [&]() -> bool { + if (root_allocates.size()) { + return true; + } + auto* block_realize = func->body.as(); + if (block_realize && block_realize->block->iter_vars.size()) { + return true; + } + if (!block_realize && ContainsNode(func->body)) { + return true; + } + return false; + }(); + + if (should_insert_root) { + Block root_block({}, {}, {}, "root", std::move(res), NullOpt, root_allocates); + res = BlockRealize({}, Bool(true), std::move(root_block)); } + + // generate surrounding loops automatically + ScriptCompleter script_completer(&buffer_var_map); + res = script_completer(std::move(res)); + if (func->body.same_as(res)) { return func; } else { diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 1f5871b488e2..e139d2111bee 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3142,6 +3142,25 @@ def func_root_attr(): return func_root_attr +def func_trivial_root_block(): + @T.prim_func + def func(A: T.Buffer[1, "int32"]): + with T.block("root"): + A[0] = 0 + + return func + + +def func_nested_root_block(): + @T.prim_func + def func(A: T.Buffer[1, "int32"]): + with T.block("root"): + with T.block("block"): + A[0] = 0 + + return func + + def func_T_ptr_let_statement(): @T.prim_func def func_T_ptr_let_statement( @@ -3418,6 +3437,8 @@ def func() -> None: func_with_target_spec_by_config, func_with_target_spec_by_str, func_root_attr, + func_trivial_root_block, + func_nested_root_block, func_T_ptr_let_statement, func_T_ptr_allocate, llvm_intrin_call, From da0e5e3be2834b214ca7035fb50d9d378ecc5c52 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 21 Sep 2022 11:13:49 -0500 Subject: [PATCH 222/704] [Utils] Disable automatic move constructor for 
tvm::With (#12822) * [Utils] Move constructor for tvm::With Previously, `tvm::With` had the default compiler-provided move constructors. If these were used (e.g. by storing a `With` into a vector), the `ExitWithScope` would be called multiple times. This commit explicitly removes the copy constructor/assignment, and explicitly implements move constructor/assignment. * Update PR to remove move With's constructor/assignment altogether --- include/tvm/support/with.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h index d28e9f3a6894..5959affafdb3 100644 --- a/include/tvm/support/with.h +++ b/include/tvm/support/with.h @@ -68,6 +68,15 @@ class With { /*! \brief destructor, leaves the scope of the context. */ ~With() DMLC_THROW_EXCEPTION { ctx_.ExitWithScope(); } + // Disable copy and move construction. `With` is intended only for + // use in nested contexts that are exited in the reverse order of + // entry. Allowing context to be copied or moved would break this + // expectation. 
+ With(const With& other) = delete; + With& operator=(const With& other) = delete; + With(With&& other) = delete; + With& operator=(With&& other) = delete; + ContextType* get() { return &ctx_; } const ContextType* get() const { return &ctx_; } From 3c8a94bd4eedb43d5402ec41755a4f57a90ff4fe Mon Sep 17 00:00:00 2001 From: chengven027-intellif Date: Thu, 22 Sep 2022 02:57:30 +0800 Subject: [PATCH 223/704] [frontend][torch] Support aten::relu6 operator (#12855) support aten::relu6 operator --- python/tvm/relay/frontend/pytorch.py | 5 +++++ tests/python/frontend/pytorch/test_forward.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 7c52393b8468..b0e594d99312 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -840,6 +840,10 @@ def relu(self, inputs, input_types): return qnn_torch.quantized_relu(data, input_zero_point) return _op.nn.relu(data) + def relu6(self, inputs, input_types): + data = inputs[0] + return _op.tensor.clip(data, 0.0, 6.0) + def prelu(self, inputs, input_types): # Reference: https://pytorch.org/docs/stable/generated/torch.nn.PReLU.html#torch.nn.PReLU data = inputs[0] @@ -3477,6 +3481,7 @@ def create_convert_map(self): "aten::where": self.where, "aten::topk": self.topk, "aten::relu": self.relu, + "aten::relu6": self.relu6, "aten::prelu": self.prelu, "aten::leaky_relu": self.leaky_relu, "aten::elu": self.elu, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 2d0a476e372d..0525c5fd8e7d 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -698,6 +698,15 @@ def test_forward_relu(): verify_model(torch.nn.ReLU().eval(), input_data=input_data) +@tvm.testing.uses_gpu +def test_forward_relu6(): + """test_forward_relu6""" + torch.set_grad_enabled(False) + input_shape = [10, 10] + input_data = 
torch.rand(input_shape).float() + verify_model(torch.nn.ReLU6().eval(), input_data=input_data) + + @tvm.testing.uses_gpu def test_forward_prelu(): """test_forward_prelu""" From c0c7569529cb258c151acf101e6e4650c726d34d Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Wed, 21 Sep 2022 11:58:46 -0700 Subject: [PATCH 224/704] Allow failures in pr_comment_bot for now (#12860) Allow failures in pr_comment_bot for now. --- .github/workflows/pr_comment_bot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr_comment_bot.yml b/.github/workflows/pr_comment_bot.yml index 89416df928b8..1ac33c77d2e5 100644 --- a/.github/workflows/pr_comment_bot.yml +++ b/.github/workflows/pr_comment_bot.yml @@ -49,7 +49,7 @@ jobs: if [[ "$URL" == *"PR-"* ]]; then echo "PR status, sending comment" PR_NUMBER=$(echo $URL | sed 's/.*PR-//g' | sed 's/\/.*//g') - python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" + python ci/scripts/github_pr_comment.py --pr "$PR_NUMBER" || /bin/true else echo "Not a PR status, skipping" fi From 7aef584c0f8fb3b516afde3fb5fac9c2d0969c0a Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 21 Sep 2022 15:14:28 -0700 Subject: [PATCH 225/704] [Hybrid] Fix sys version check (#12837) This is a follow-up to #12769 The check for sys version of python 3.9 is not correct. 
Fixed #12814 --- python/tvm/te/hybrid/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py index 4956aaf0be32..ec103ac18811 100644 --- a/python/tvm/te/hybrid/parser.py +++ b/python/tvm/te/hybrid/parser.py @@ -374,7 +374,7 @@ def visit_Attribute(self, node): def visit_Subscript(self, node): args = self.visit(node.slice) - if sys.version_info > (3, 8): + if sys.version_info >= (3, 9): if not isinstance(node.slice, ast.Tuple): args = [args] From 39f71ae2881f5c647aa8e98e4f6d87ed84a28688 Mon Sep 17 00:00:00 2001 From: Oleksandr Viazlo Date: Thu, 22 Sep 2022 10:21:17 +0200 Subject: [PATCH 226/704] [frontend][pytorch] Add a new test case for torch aten::fill_ operator implementation (#12857) Fix aten::fill_ torch operator implementation by adding constant folding on the fill value. Add new test case for torch aten::fill_ operator implementation. --- python/tvm/relay/frontend/pytorch.py | 8 ++++++-- tests/python/frontend/pytorch/test_forward.py | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index b0e594d99312..e35e23b3381c 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -43,7 +43,7 @@ from .common import infer_shape as _infer_shape from .common import infer_value as _infer_value from .common import infer_value_simulated as _infer_value_simulated -from .common import lstm_cell, try_infer_value, unbind +from .common import lstm_cell, try_infer_value, unbind, fold_constant from .pytorch_utils import is_version_greater_than, getattr_attr_name __all__ = ["from_pytorch"] @@ -672,7 +672,9 @@ def full_impl(self, data, fill_value, dtype): tmp.append(_op.cast(_op.expand_dims(dim, axis=0), "int64")) size = _op.concatenate(tmp, axis=0) - out = _op.full(_expr.const(fill_value, dtype=dtype), size, dtype=dtype) + if not isinstance(fill_value, 
_expr.Constant): + fill_value = _expr.const(fill_value, dtype=dtype) + out = _op.full(fill_value, size, dtype=dtype) if need_reshape: out = _op.reshape(out, new_shape) return out @@ -805,6 +807,8 @@ def new_full(self, inputs, input_types): def fill_(self, inputs, input_types): data = inputs[0] fill_value = inputs[1] + if not isinstance(fill_value, (bool, int, float, complex)): + fill_value = fold_constant(fill_value) return self.full_impl(self.infer_shape(data), fill_value, input_types[0]) def linspace(self, inputs, input_types): diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 0525c5fd8e7d..5236b763faf0 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -3341,6 +3341,16 @@ def test_func(x): verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) +def test_forward_fill_with_div(): + """test_forward_fill_with_div""" + + def test_func(x): + y = torch.div(torch.tensor(6.0), torch.tensor(2.0)) + return x.fill_(y) + + verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) + + @tvm.testing.uses_gpu def test_forward_linspace(): """test_forward_linspace""" From fe75f00991f60d4483d2d14f7ec23bb6fda956a9 Mon Sep 17 00:00:00 2001 From: Alexey Voronov Date: Thu, 22 Sep 2022 13:52:28 +0300 Subject: [PATCH 227/704] [AutoTVM] Introducing multi_filter into ConfigSpace autotvm (#12545) * [AutoTVM] Introducing multi_filter into ConfigSpace autotvm Co-authored-by: Andrey Malyshev elvin.nnov@gmail.com Co-authored-by: Egor Churaev egor.churaev@gmail.com * update multi_filter for adreno conv's --- python/tvm/autotvm/task/space.py | 330 +++++++++++++++++- python/tvm/autotvm/tuner/ga_tuner.py | 108 +++--- python/tvm/autotvm/tuner/index_based_tuner.py | 73 ++-- python/tvm/autotvm/tuner/model_based_tuner.py | 40 +-- .../tvm/autotvm/tuner/sa_model_optimizer.py | 39 +-- python/tvm/autotvm/tuner/tuner.py | 1 + 
python/tvm/autotvm/utils.py | 32 -- python/tvm/topi/adreno/conv2d_nchw.py | 10 +- python/tvm/topi/adreno/conv2d_nhwc.py | 10 +- .../tvm/topi/adreno/conv2d_winograd_common.py | 7 +- .../tvm/topi/adreno/depthwise_conv2d_nchw.py | 9 + .../tvm/topi/adreno/depthwise_conv2d_nhwc.py | 9 + .../test_topi_conv2d_hwnc_tensorcore.py | 4 +- .../python/unittest/test_autotvm_ga_tuner.py | 89 +++++ .../unittest/test_autotvm_index_tuner.py | 77 +++- tests/python/unittest/test_autotvm_space.py | 167 ++++++++- 16 files changed, 758 insertions(+), 247 deletions(-) create mode 100644 tests/python/unittest/test_autotvm_ga_tuner.py diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index 4d6b23162a25..22812f907bb3 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -30,6 +30,7 @@ import functools import math from collections import namedtuple, OrderedDict +from random import randrange import numpy as np from tvm.te import schedule, thread_axis @@ -665,6 +666,8 @@ def __init__(self): self.space_map = OrderedDict() # name -> space self._collect = True self._length = None + self._range_length = None + self._dims = None self._entity_map = OrderedDict() # name -> entity self._constraints = [] self.errors = [] @@ -672,6 +675,8 @@ def __init__(self): self.flop = 0 self.cost = None self.is_fallback = False + self._shared_filter = None + self._shared_filter_cache = None @staticmethod def axis(var): @@ -714,18 +719,19 @@ def define_split(self, name, axis, policy="factors", **kwargs): the total number of axis after split (`int`). ``no_tail``: should we only include divisible numbers as split factors (`bool`). - `candidate``: + ``candidate``: (policy=candidate) manual candidate list (`List`). 
Examples -------- >>> # use custom candidates - >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]]) + >>> cfg.define_split('tile_x', x, policy='candidate', num_outputs=3, + >>> candidate=[[1, 4, 4], [4, 1, 4]]) >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4 - >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4) + >>> cfg.define_split('tile_y', y, policy='factors', num_outputs=3, + >>> filter=lambda x: x.size[-1] <= 4) """ - axes = [axis] return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs) @@ -822,11 +828,300 @@ def valid(self): """ return not bool(self.errors) + def is_index_valid(self, index): + """Checks if the index satisfies the multi_filter condition + + Parameters + ---------- + index: int + index from the range of the space + + Returns + ------- + valid: bool + whether the index meets all the constraints + """ + assert 0 <= index < self.range_length + if self._shared_filter is None: + return True + if self._shared_filter_cache is None: + self._make_shared_filter_cache() + return self._shared_filter_cache[index] + + def multi_filter(self, filter): # pylint: disable=redefined-builtin + """The filter can restrict combination of parameters in difference to the knob filter, + that restricts only single parameter + + Parameters + ---------- + filter: function + predicate with one argument (Callable[[int], bool]) + + .. note:: + + Using this filter causes additional restrictions on the use of __len__. + Normally, it define the count of valid indexes and the range of space, but when + multi_filter enabled, it requires to use __len__ for getting the count of valid + indexes or range_length for the range of space. 
It is recommended to use: + ``is_index_valid``, ``get_next_index``, ``get_rand_index`` to bypass the space + + Examples + -------- + >>> # Pre-requisites + >>> candidates = [[16, 64], [32, 32], [64, 16]] + >>> filter = lambda v: v.size[0] != 16 + >>> multi_filter = lambda e: (e["tile_x"].size[0] + e["tile_y"].size[0]) <= 64 + + >>> # Case 1 - without filtering + >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates) + >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates) + >>> # [('tile_x', [16, 64]), ('tile_y', [16, 64])],None,0 + >>> # [('tile_x', [32, 32]), ('tile_y', [16, 64])],None,1 + >>> # [('tile_x', [64, 16]), ('tile_y', [16, 64])],None,2 + >>> # [('tile_x', [16, 64]), ('tile_y', [32, 32])],None,3 + >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,4 + >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,5 + >>> # [('tile_x', [16, 64]), ('tile_y', [64, 16])],None,6 + >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,7 + >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,8 + + >>> # Case 2 - with filter + >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates, + >>> filter=filter) + >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates, + >>> filter=filter) + >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0 + >>> # [('tile_x', [64, 16]), ('tile_y', [32, 32])],None,1 + >>> # [('tile_x', [32, 32]), ('tile_y', [64, 16])],None,2 + >>> # [('tile_x', [64, 16]), ('tile_y', [64, 16])],None,3 + + >>> # Case 3 - with filter and multi_filter + >>> cfg.define_split("tile_x", x, num_outputs=2, policy="candidate", candidate=candidates, + >>> filter=filter) + >>> cfg.define_split("tile_y", y, num_outputs=2, policy="candidate", candidate=candidates, + >>> filter=filter) + >>> cfg.multi_filter(filter=multi_filter) + >>> # [('tile_x', [32, 32]), ('tile_y', [32, 32])],None,0 + """ + if self._collect: + 
self.clear_cache() + self._shared_filter = filter + + @property + def range_length(self): + """Length of the index range in the space""" + if self._range_length is None: + self._range_length = int(np.prod([len(x) for x in self.space_map.values()])) + return self._range_length + + @property + def dims(self): + """Dimensions in the space""" + if self._dims is None: + self._dims = [len(x) for x in self.space_map.values()] + return self._dims + + def subrange_length(self, start, end): + """Returns the number of valid indexes within the limited range from [start, end] + + Parameters + ---------- + start: int + start of subrange, inclusive + end: int + end of subrange, exclusive + + Returns + ------- + count: int + number of valid indexes + """ + assert 0 <= start <= end <= self.range_length + if self._shared_filter is None: + return end - start + if self._shared_filter_cache is None: + self._make_shared_filter_cache() + return self._shared_filter_cache[start:end].count(True) + + def get_rand_index(self, start=None, end=None, to_exclude=None): + """Returns a random valid index unlisted to exclusion + + Parameters + ---------- + start: int, optional + specifying at which position to start, inclusive + end: int, optional + specifying at which position to end, exclusive + to_exclude: list, optional + determines unsuitable values + + Returns + ------- + rand: int + random index in the space + + .. note:: + + Excluding all valid space indexes will lead to an infinite loop. 
+ + """ + start = start or 0 + end = end or self.range_length + while True: + index = randrange(start, end) + if self.is_index_valid(index) and index not in (to_exclude or []): + return index + + def get_next_index(self, index, n=1, start=None, end=None): + """Returns the nth valid next index or None if out of range + + Parameters + ---------- + index: int + specifying at which position to start, inclusive + n: int, optional + step by using to find the next index, for the opposite + direction a negative number should be used + start: list, optional + start of subrange, inclusive + end: list, optional + end of subrange, exclusive + + Returns + ------- + next: int + next index in the space + """ + assert n != 0 + start = start or 0 + end = end or self.range_length + if self._shared_filter is None: + index += n + if start <= index < end: + return index + return None + trend = 1 if n > 0 else -1 + counter = abs(n) + while counter != 0: + index += trend + if index < start or index >= end: + return None + if self.is_index_valid(index): + counter -= 1 + return index + + def clear_cache(self): + """Clears the cache of index validity""" + del self._shared_filter_cache + self._dims = None + self._length = None + self._range_length = None + self._shared_filter_cache = None + + def _make_shared_filter_cache(self): + def apply(t): + entities = OrderedDict() + for name, space in self.space_map.items(): + entities[name] = space[t % len(space)] + t //= len(space) + return bool(self._shared_filter(entities)) + + self._shared_filter_cache = tuple(apply(i) for i in range(self.range_length)) + self._length = self._shared_filter_cache.count(True) + + def point2knob(self, point): + """Convert point form (single integer) to knob (vector) + + Parameters + ---------- + point: int + point to convert + + Returns + ------- + knob: list + knob representation of the point + """ + knob = [] + for dim in self.dims: + knob.append(point % dim) + point //= dim + return knob + + def knob2point(self, 
knob): + """Convert knob form (vector) to point form (single integer) + + Parameters + ---------- + knob: list + knob to convert + + Returns + ------- + point: int + point of the knob representation + """ + point = 0 + for j, k in enumerate(knob): + point += int(np.prod(self.dims[:j])) * k + return point + + def sample_ints(self, m): + """ + Sample m different integer numbers from [0, self.range_length) without replacement + This function is an alternative of `np.random.choice` when self.range_length > 2 ^ 32, in + which case numpy does not work. + + Parameters + ---------- + m: int + The number of sampled int + + Returns + ------- + ints: an numpy array of size m + """ + assert m <= len(self) + vis = set() + while len(vis) < m: + new = randrange(0, self.range_length) + if self.is_index_valid(new): + vis.add(new) + return np.fromiter(vis, int, len(vis)) + + def random_walk(self, point): + """random walk as local transition + + Parameters + ---------- + point: int + index of the ConfigEntity + + Returns + ------- + new_point: int + new neighborhood index + """ + # transform to knob form + old_knob = self.point2knob(point) + new_knob = old_knob.copy() + new_point = self.knob2point(new_knob) + # mutate + while new_knob == old_knob or not self.is_index_valid(new_point): + from_i = np.random.randint(len(old_knob)) + to_v = np.random.randint(self.dims[from_i]) + new_knob[from_i] = to_v + new_point = self.knob2point(new_knob) + # transform to index form + return new_point + def _add_new_transform(self, space_class, name, axes, policy, **kwargs): """Add a new transform space in template""" # if we do not have tuned info (_collect == True) but defined KNOB value # for "default" scheduling before call of _add_new_transform, in this case # no need to create new space and override previously pointed KNOB values + if kwargs.get("filter"): + self.clear_cache() if self._collect and not (self.is_fallback and name in self._entity_map): # convert schedule axis to space definition 
axis axes = [x if isinstance(x, (VirtualAxis, Axis)) else self.axis(x) for x in axes] @@ -839,8 +1134,11 @@ def _add_new_transform(self, space_class, name, axes, policy, **kwargs): return [Axis(None, i) for i in range(space_class.get_num_output(axes, policy, **kwargs))] def __len__(self): - if self._length is None: - self._length = int(np.prod([len(x) for x in self.space_map.values()])) + """Returns the number of valid indexes in the space""" + if self._shared_filter is None: + return self.range_length + if self._shared_filter_cache is None: + self._make_shared_filter_cache() return self._length def get(self, index): @@ -850,9 +1148,21 @@ def get(self, index): ---------- index: int index in the space + + Returns + ------- + config: ConfigEntity + config corresponds to the index """ - if index < 0 or index >= len(self): - raise IndexError("Index out of range: size {}, got index {}".format(len(self), index)) + if index < 0 or index >= self.range_length: + raise IndexError( + "Index out of range: size {}, got index {}".format(self.range_length, index) + ) + if not self.is_index_valid(index): + raise IndexError( + "Index does not correspond to the multi-filter condition, got index {}. 
" + "Use is_index_valid to pre-check".format(index) + ) entities = OrderedDict() t = index for name, space in self.space_map.items(): @@ -876,7 +1186,9 @@ def __getitem__(self, name): return self._entity_map[name] def __repr__(self): - res = "ConfigSpace (len=%d, space_map=\n" % len(self) + res = "ConfigSpace (len={}, range_length={}, space_map=\n".format( + len(self), self.range_length + ) for i, (name, space) in enumerate(self.space_map.items()): res += " %2d %s: %s\n" % (i, name, space) return res + ")" diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index 2ecd120e8504..ad5b87ac5d70 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -21,7 +21,6 @@ import numpy as np from .tuner import Tuner -from .model_based_tuner import knob2point, point2knob class GATuner(Tuner): @@ -49,41 +48,24 @@ def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1): assert elite_num <= pop_size, "The number of elites must be less than population size" - # space info - self.space = task.config_space - self.dim_keys = [] - self.dims = [] - for k, v in self.space.space_map.items(): - self.dim_keys.append(k) - self.dims.append(len(v)) - - self.visited = set([]) + # random initialization + self.pop_size = min(self.pop_size, len(self.space)) + self.elite_num = min(self.pop_size, self.elite_num) + self.visited = set(self.space.sample_ints(self.pop_size)) # current generation - self.genes = [] + self.genes = [self.space.point2knob(idx) for idx in self.visited] self.scores = [] self.elites = [] self.elite_scores = [] self.trial_pt = 0 - # random initialization - self.pop_size = min(self.pop_size, len(self.space)) - self.elite_num = min(self.pop_size, self.elite_num) - for _ in range(self.pop_size): - tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims) - while knob2point(tmp_gene, self.dims) in self.visited: - tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims) - 
- self.genes.append(tmp_gene) - self.visited.add(knob2point(tmp_gene, self.dims)) - def next_batch(self, batch_size): ret = [] - for _ in range(batch_size): + while len(ret) < batch_size and self.has_next(): gene = self.genes[self.trial_pt % self.pop_size] self.trial_pt += 1 - ret.append(self.space.get(knob2point(gene, self.dims))) - + ret.append(self.space.get(self.space.knob2point(gene))) return ret def update(self, inputs, results): @@ -95,47 +77,43 @@ def update(self, inputs, results): self.scores.append(0.0) if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space): - genes = self.genes + self.elites - scores = np.array(self.scores[: len(self.genes)] + self.elite_scores) - - # reserve elite - self.elites, self.elite_scores = [], [] - elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num :] - for ind in elite_indexes: - self.elites.append(genes[ind]) - self.elite_scores.append(scores[ind]) - - # cross over - indices = np.arange(len(genes)) - scores += 1e-8 - scores /= np.max(scores) - probs = scores / np.sum(scores) - tmp_genes = [] - for _ in range(self.pop_size): - p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs) - p1, p2 = genes[p1], genes[p2] - point = np.random.randint(len(self.dims)) - tmp_gene = p1[:point] + p2[point:] - tmp_genes.append(tmp_gene) - - # mutation next_genes = [] - for tmp_gene in tmp_genes: - for j, dim in enumerate(self.dims): - if np.random.random() < self.mutation_prob: - tmp_gene[j] = np.random.randint(dim) - - if len(self.visited) < len(self.space): - while knob2point(tmp_gene, self.dims) in self.visited: - j = np.random.randint(len(self.dims)) - tmp_gene[j] = np.random.randint( - self.dims[j] # pylint: disable=invalid-sequence-index - ) - next_genes.append(tmp_gene) - self.visited.add(knob2point(tmp_gene, self.dims)) - else: - break - + # There is no reason to crossover or mutate since the size of the unvisited + # is no larger than the size of the population. 
+ if len(self.space) - len(self.visited) <= self.pop_size: + for idx in range(self.space.range_length): + if self.space.is_index_valid(idx) and idx not in self.visited: + next_genes.append(self.space.point2knob(idx)) + self.visited.add(idx) + else: + genes = self.genes + self.elites + scores = np.array(self.scores[: len(self.genes)] + self.elite_scores) + + # reserve elite + self.elites, self.elite_scores = [], [] + elite_indexes = np.argpartition(scores, -self.elite_num)[-self.elite_num :] + for ind in elite_indexes: + self.elites.append(genes[ind]) + self.elite_scores.append(scores[ind]) + + indices = np.arange(len(genes)) + scores += 1e-8 + scores /= np.max(scores) + probs = scores / np.sum(scores) + while len(next_genes) < self.pop_size: + # cross over + p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs) + p1, p2 = genes[p1], genes[p2] + point = np.random.randint(len(self.space.dims)) + tmp_gene = p1[:point] + p2[point:] + # mutation + for j, dim in enumerate(self.space.dims): + if np.random.random() < self.mutation_prob: + tmp_gene[j] = np.random.randint(dim) + + if self.space.is_index_valid(self.space.knob2point(tmp_gene)): + next_genes.append(tmp_gene) + self.visited.add(self.space.knob2point(tmp_gene)) self.genes = next_genes self.trial_pt = 0 self.scores = [] diff --git a/python/tvm/autotvm/tuner/index_based_tuner.py b/python/tvm/autotvm/tuner/index_based_tuner.py index 972de65154c9..881728bc9b34 100644 --- a/python/tvm/autotvm/tuner/index_based_tuner.py +++ b/python/tvm/autotvm/tuner/index_based_tuner.py @@ -17,8 +17,6 @@ # pylint: disable=abstract-method """Grid search tuner and random tuner""" -import numpy as np - from .tuner import Tuner @@ -32,7 +30,7 @@ class IndexBaseTuner(Tuner): The tuning task range_idx: Optional[Tuple[int, int]] - A tuple of index range that this tuner can select from + A tuple of index range that this tuner can select from [begin_idx, end_idx] """ def __init__(self, task, range_idx=None): @@ -41,17 +39,19 @@ 
def __init__(self, task, range_idx=None): range_idx, tuple ), "range_idx must be None or (int, int)" - self.range_length = len(self.task.config_space) - self.index_offset = 0 - if range_idx is not None: - assert range_idx[1] > range_idx[0], "Index range must be positive" - assert range_idx[0] >= 0, "Start index must be positive" - self.range_length = range_idx[1] - range_idx[0] + 1 - self.index_offset = range_idx[0] - self.counter = 0 + self.visited = [] + self.begin_idx, self.end_idx = range_idx or (0, self.space.range_length - 1) + assert self.begin_idx >= 0, "Start index must be positive" + self.end_idx += 1 # Further end_idx is exclusive + assert ( + self.end_idx <= self.space.range_length + ), "Finish index must be less the space range length " + self.range_length = self.end_idx - self.begin_idx + assert self.range_length > 0, "Index range must be positive" + self.visited_max = self.space.subrange_length(self.begin_idx, self.end_idx) def has_next(self): - return self.counter < self.range_length + return len(self.visited) < self.visited_max def load_history(self, data_set, min_seed_records=500): pass @@ -60,14 +60,23 @@ def load_history(self, data_set, min_seed_records=500): class GridSearchTuner(IndexBaseTuner): """Enumerate the search space in a grid search order""" + def __init__(self, task, range_idx=None): + super(GridSearchTuner, self).__init__(task, range_idx) + + self.index = self.begin_idx + if not self.space.is_index_valid(self.index): + self.index = self.space.get_next_index( + self.index, start=self.begin_idx, end=self.end_idx + ) + def next_batch(self, batch_size): ret = [] - for _ in range(batch_size): - if self.counter >= self.range_length: - break - index = self.counter + self.index_offset - ret.append(self.task.config_space.get(index)) - self.counter = self.counter + 1 + while len(ret) < batch_size and self.has_next(): + self.visited.append(self.index) + ret.append(self.space.get(self.index)) + self.index = self.space.get_next_index( + 
self.index, start=self.begin_idx, end=self.end_idx + ) return ret @@ -83,32 +92,10 @@ class RandomTuner(IndexBaseTuner): A tuple of index range to random """ - def __init__(self, task, range_idx=None): - super(RandomTuner, self).__init__(task, range_idx) - - # Use a dict to mimic a range(n) list without storing rand_state[i] = i entries so that - # we can generate non-repetitive random indices. - self.rand_state = {} - self.rand_max = self.range_length - self.visited = [] - def next_batch(self, batch_size): ret = [] - for _ in range(batch_size): - if self.rand_max == 0: - break - - # Random an indirect index. - index_ = np.random.randint(self.rand_max) - self.rand_max -= 1 - - # Use the indirect index to get a direct index. - index = self.rand_state.get(index_, index_) + self.index_offset - ret.append(self.task.config_space.get(index)) + while len(ret) < batch_size and self.has_next(): + index = self.space.get_rand_index(self.begin_idx, self.end_idx, to_exclude=self.visited) self.visited.append(index) - - # Update the direct index map. 
- self.rand_state[index_] = self.rand_state.get(self.rand_max, self.rand_max) - self.rand_state.pop(self.rand_max, None) - self.counter += 1 + ret.append(self.space.get(index)) return ret diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py index f07e7fb4eb76..0841e9a76528 100644 --- a/python/tvm/autotvm/tuner/model_based_tuner.py +++ b/python/tvm/autotvm/tuner/model_based_tuner.py @@ -207,9 +207,6 @@ def __init__(self, task, cost_model, model_optimizer, plan_size, diversity_filte self.task = task self.target = task.target self.plan_size = plan_size - self.space = task.config_space - self.space_len = len(task.config_space) - self.dims = [len(x) for x in self.space.space_map.values()] self.cost_model = cost_model self.model_optimizer = model_optimizer @@ -233,29 +230,19 @@ def __init__(self, task, cost_model, model_optimizer, plan_size, diversity_filte def next_batch(self, batch_size): ret = [] - - counter = 0 - while counter < batch_size: - if len(self.visited) >= len(self.space): - break - + while len(ret) < batch_size and self.has_next(): while self.trial_pt < len(self.trials): index = self.trials[self.trial_pt] - if index not in self.visited: + if index not in self.visited and self.space.is_index_valid(index): break self.trial_pt += 1 if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size): # if the trial list is empty or # the tuner is doing the last 5% trials (e-greedy), choose randomly - index = np.random.randint(len(self.space)) - while index in self.visited: - index = np.random.randint(len(self.space)) - + index = self.space.get_rand_index(to_exclude=self.visited) ret.append(self.space.get(index)) self.visited.add(index) - - counter += 1 return ret def update(self, inputs, results): @@ -274,8 +261,8 @@ def update(self, inputs, results): # However, adding the index to visited again here enables us # to also use this update function to resume tuning progress in # case of interruption. 
+ assert self.space.is_index_valid(index) self.visited.add(index) - # if we have enough new training samples if len(self.xs) >= self.plan_size * (self.train_ct + 1) and self.flops_max > 1e-6: self.cost_model.fit(self.xs, self.ys, self.plan_size) @@ -284,7 +271,7 @@ def update(self, inputs, results): self.cost_model, self.plan_size * self.diversity_filter_ratio, self.visited ) scores = self.cost_model.predict(candidate) - knobs = [point2knob(x, self.dims) for x in candidate] + knobs = [self.space.point2knob(x) for x in candidate] pick_index = submodular_pick(0 * scores, knobs, self.plan_size, knob_weight=1) maximums = np.array(candidate)[pick_index] else: @@ -322,23 +309,6 @@ def has_next(self): return len(self.visited) < len(self.space) -def point2knob(p, dims): - """convert point form (single integer) to knob form (vector)""" - knob = [] - for dim in dims: - knob.append(p % dim) - p //= dim - return knob - - -def knob2point(knob, dims): - """convert knob form (vector) to point form (single integer)""" - p = 0 - for j, k in enumerate(knob): - p += int(np.prod(dims[:j])) * k - return p - - def submodular_pick(scores, knobs, n_pick, knob_weight=1.0): """Run greedy optimization to pick points with regard to both score and diversity. 
DiversityScore = knob_weight * number of unique knobs in the selected set diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py index 401eda8c276f..a50f148f2eb2 100644 --- a/python/tvm/autotvm/tuner/sa_model_optimizer.py +++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py @@ -25,8 +25,7 @@ import numpy as np -from ..utils import sample_ints -from .model_based_tuner import ModelOptimizer, knob2point, point2knob +from .model_based_tuner import ModelOptimizer logger = logging.getLogger("autotvm") @@ -60,10 +59,7 @@ def __init__( log_interval=50, ): super(SimulatedAnnealingOptimizer, self).__init__() - self.task = task - self.dims = [len(x) for x in self.task.config_space.space_map.values()] - self.n_iter = n_iter self.temp = temp self.persistent = persistent @@ -84,7 +80,7 @@ def find_maximums(self, model, num, exclusive): if self.persistent and self.points is not None: points = self.points else: - points = np.array(sample_ints(0, len(self.task.config_space), self.parallel_size)) + points = self.task.config_space.sample_ints(self.parallel_size) scores = model.predict(points) @@ -113,7 +109,7 @@ def find_maximums(self, model, num, exclusive): while k < n_iter and k < k_last_modify + early_stop: new_points = np.empty_like(points) for i, p in enumerate(points): - new_points[i] = random_walk(p, self.dims) + new_points[i] = self.task.config_space.random_walk(p) new_scores = model.predict(new_points) @@ -157,32 +153,3 @@ def find_maximums(self, model, num, exclusive): self.points = points return [x[1] for x in heap_items] - - -def random_walk(p, dims): - """random walk as local transition - - Parameters - ---------- - p: int - index of the ConfigEntity - dims: Array of int - sizes of each dimension - - Returns - ------- - new_p: int - new neighborhood index - """ - # transform to knob form - old = point2knob(p, dims) - new = list(old) - - # mutate - while new == old: - from_i = np.random.randint(len(old)) - to_v = 
np.random.randint(dims[from_i]) - new[from_i] = to_v - - # transform to index form - return knob2point(new, dims) diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 848265ce17ca..a758a5d4cd9c 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -43,6 +43,7 @@ def __init__(self, task, **kwargs): self.recorder = None self.task = task + self.space = self.task.config_space # keep the current best self.best_config = None diff --git a/python/tvm/autotvm/utils.py b/python/tvm/autotvm/utils.py index ec3f18daa6c9..75db5208adbe 100644 --- a/python/tvm/autotvm/utils.py +++ b/python/tvm/autotvm/utils.py @@ -19,8 +19,6 @@ import logging import time -from random import randrange - import numpy as np import tvm.arith from tvm.tir import expr @@ -57,36 +55,6 @@ def get_rank(values): return ranks -def sample_ints(low, high, m): - """ - Sample m different integer numbers from [low, high) without replacement - This function is an alternative of `np.random.choice` when (high - low) > 2 ^ 32, in - which case numpy does not work. - - Parameters - ---------- - low: int - low point of sample range - high: int - high point of sample range - m: int - The number of sampled int - - Returns - ------- - ints: an array of size m - """ - vis = set() - assert m <= high - low - while len(vis) < m: - new = randrange(low, high) - while new in vis: - new = randrange(low, high) - vis.add(new) - - return list(vis) - - def pool_map(func, args, batch_size, verbose=False, pool=None): """A wrapper of multiprocessing.pool.Pool.map to support small-batch mapping for large argument list. 
This can reduce memory usage diff --git a/python/tvm/topi/adreno/conv2d_nchw.py b/python/tvm/topi/adreno/conv2d_nchw.py index 082f71364af8..b1f229ebe5dc 100644 --- a/python/tvm/topi/adreno/conv2d_nchw.py +++ b/python/tvm/topi/adreno/conv2d_nchw.py @@ -260,7 +260,15 @@ def schedule_conv2d_NCHWc_KCRSk(cfg, s, output): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) - + cfg.multi_filter( + filter=lambda entity: ( # pylint: disable=chained-comparison + entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1] + ) + <= 24 + and 32 + <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2]) + < 1024 + ) if cfg.is_fallback: get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3]) ##### space definition end ##### diff --git a/python/tvm/topi/adreno/conv2d_nhwc.py b/python/tvm/topi/adreno/conv2d_nhwc.py index 993b63252531..644978743b4d 100644 --- a/python/tvm/topi/adreno/conv2d_nhwc.py +++ b/python/tvm/topi/adreno/conv2d_nhwc.py @@ -258,7 +258,15 @@ def schedule_conv2d_NHWC(cfg, s, output): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) - + cfg.multi_filter( + filter=lambda entity: ( # pylint: disable=chained-comparison + entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1] + ) + <= 24 + and 32 + <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2]) + < 1024 + ) if cfg.is_fallback: get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2]) ##### space definition end ##### diff --git a/python/tvm/topi/adreno/conv2d_winograd_common.py b/python/tvm/topi/adreno/conv2d_winograd_common.py index 501773ad46fa..8c62f11c2fe5 100644 --- a/python/tvm/topi/adreno/conv2d_winograd_common.py +++ b/python/tvm/topi/adreno/conv2d_winograd_common.py @@ -440,10 
+440,9 @@ def schedule_conv2d_winograd(cfg, s, output, pre_computed): and entry.size[1] <= 16, ) cfg.define_split("tile_rc", rcc, num_outputs=2) - # TODO: Uncomment the following lines when multi_filter will be introduced - # cfg.multi_filter( - # filter=lambda entity: entity["tile_y"].size[2] * entity["tile_x"].size[2] in range(32,1024) - # ) + cfg.multi_filter( + filter=lambda entity: 32 <= (entity["tile_y"].size[2] * entity["tile_x"].size[2]) < 1024 + ) ##### space definition end ##### # batch gemm diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py index eb998bdbcd6e..8549399fb0d0 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nchw.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nchw.py @@ -214,6 +214,15 @@ def schedule_depthwise_conv2d_NCHWc_KCRSk(cfg, s, output): cfg.define_split("tile_rx", rx, num_outputs=2) cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + cfg.multi_filter( + filter=lambda entity: ( # pylint: disable=chained-comparison + entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1] + ) + <= 32 + and 32 + <= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2]) + < 1024 + ) if cfg.is_fallback: get_default_conv2d_config(cfg, conv.shape[1], conv.shape[2], conv.shape[3]) diff --git a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py index c27f2a9eae7c..82e128443e85 100644 --- a/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py +++ b/python/tvm/topi/adreno/depthwise_conv2d_nhwc.py @@ -211,6 +211,15 @@ def schedule_depthwise_conv2d_NHWC_HWOI(cfg, s, output): cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) cfg.define_knob("unroll_explicit", [0, 1]) + cfg.multi_filter( + filter=lambda entity: ( # pylint: disable=chained-comparison + entity["tile_fc"].size[1] * entity["tile_y"].size[1] * entity["tile_x"].size[1] + ) + <= 32 + and 32 + 
<= (entity["tile_fc"].size[2] * entity["tile_y"].size[2] * entity["tile_x"].size[2]) + < 1024 + ) if cfg.is_fallback: get_default_conv2d_config(cfg, conv.shape[3], conv.shape[1], conv.shape[2]) ##### space definition end ##### diff --git a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py index 5448a54fae6b..1dbff816699e 100644 --- a/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py +++ b/tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py @@ -175,8 +175,8 @@ def get_mod(): space = task.config_space - idx1 = np.random.randint(len(space)) - idx2 = np.random.randint(len(space)) + idx1 = space.get_rand_index() + idx2 = space.get_rand_index() cfg = space.get(idx1) sch, arg_bufs = task.instantiate(cfg) diff --git a/tests/python/unittest/test_autotvm_ga_tuner.py b/tests/python/unittest/test_autotvm_ga_tuner.py new file mode 100644 index 000000000000..625c6c66b6f2 --- /dev/null +++ b/tests/python/unittest/test_autotvm_ga_tuner.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test genetic algorithm tuner""" + +from tvm.testing.autotvm import DummyRunner, get_sample_task +from tvm import autotvm + + +def test_ga_tuner(): + """Test GATuner""" + # Test population size smaller than space size tuning configuration + task, _ = get_sample_task() + tuner = autotvm.tuner.GATuner(task, pop_size=32) + valid_indexes = list( + filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length)) + ) + assert tuner.visited.issubset(valid_indexes) + assert tuner.pop_size == len(tuner.visited) == len(tuner.genes) + assert len(tuner.space) == 64 + + measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + tuner.tune(n_trial=len(tuner.space), measure_option=measure_option) + assert tuner.visited.issubset(valid_indexes) + + # Test population size bigger than space size tuning configuration + task, _ = get_sample_task() + tuner = autotvm.tuner.GATuner(task, pop_size=100) + valid_indexes = list( + filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length)) + ) + assert tuner.visited.issubset(valid_indexes) + assert tuner.pop_size == len(tuner.visited) == len(tuner.genes) + assert len(tuner.space) == 64 + + measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + tuner.tune(n_trial=len(tuner.space), measure_option=measure_option) + assert tuner.visited.issubset(valid_indexes) + + # Test population size smaller than multi-filtered space size tuning configuration + task, _ = get_sample_task() + task.config_space.multi_filter( + filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + tuner = autotvm.tuner.GATuner(task, pop_size=32) + valid_indexes = list( + filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length)) + ) + assert tuner.visited.issubset(valid_indexes) + assert tuner.pop_size == len(tuner.visited) == len(tuner.genes) + assert len(tuner.space) == 43 + + 
measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + tuner.tune(n_trial=len(tuner.space), measure_option=measure_option) + assert tuner.visited.issubset(valid_indexes) + + # Test population size bigger than multi-filtered space size tuning configuration + task, _ = get_sample_task() + task.config_space.multi_filter( + filter=lambda entity: 8 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + tuner = autotvm.tuner.GATuner(task, pop_size=100) + valid_indexes = list( + filter(lambda idx: tuner.space.is_index_valid(idx), range(tuner.space.range_length)) + ) + assert tuner.visited.issubset(valid_indexes) + assert tuner.pop_size == len(tuner.visited) == len(tuner.genes) + assert len(tuner.space) == 43 + + measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + tuner.tune(n_trial=len(tuner.space), measure_option=measure_option) + assert tuner.visited.issubset(valid_indexes) + + +if __name__ == "__main__": + test_ga_tuner() diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py index be89ee2506fc..514577f1c986 100644 --- a/tests/python/unittest/test_autotvm_index_tuner.py +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -19,10 +19,9 @@ import multiprocessing from tvm.testing.autotvm import DummyRunner, get_sample_task from tvm import autotvm -from tvm.autotvm.tuner import GridSearchTuner, RandomTuner -def test_gridsearch_tuner(): +def test_grid_search_tuner(): """Test GridSearchTuner""" task, _ = get_sample_task() @@ -30,28 +29,60 @@ def test_gridsearch_tuner(): # When no range index, range_length should be the length of config space tuner = autotvm.tuner.GridSearchTuner(task) - assert tuner.range_length == len(task.config_space) - assert tuner.index_offset == 0 + assert tuner.begin_idx == 0 + assert tuner.end_idx == 64 + assert tuner.index == 0 + assert tuner.range_length == 64 + assert 
tuner.visited_max == 64 # With range index, range_length should be the length of the specified range tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15)) + assert tuner.begin_idx == 8 + assert tuner.end_idx == 16 + assert tuner.index == 8 assert tuner.range_length == 8 - assert tuner.index_offset == 8 + assert tuner.visited_max == 8 # Tuner should only focus on the specified range tuner.tune(n_trial=8, measure_option=measure_option) - assert tuner.counter == 8 + assert len(tuner.visited) == 8 + assert not tuner.has_next() + + # With multi-filter + task, _ = get_sample_task() + task.config_space.multi_filter( + filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + + tuner = autotvm.tuner.GridSearchTuner(task) + assert tuner.begin_idx == 0 + assert tuner.end_idx == 64 + assert tuner.index == 5 + assert tuner.range_length == 64 + assert tuner.visited_max == 34 + + # With range index, range_length should be the length of the specified range + tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15)) + assert tuner.begin_idx == 8 + assert tuner.end_idx == 16 + assert tuner.index == 12 + assert tuner.range_length == 8 + assert tuner.visited_max == 4 + + # Tuner should only focus on the specified range + tuner.tune(n_trial=8, measure_option=measure_option) + assert len(tuner.visited) == 4 assert not tuner.has_next() def grid_search_spawn(): assert multiprocessing.get_spawn_method(False) == "spawn" - test_gridsearch_tuner() + test_grid_search_tuner() def test_grid_search_tuner_spawn(): ctx = multiprocessing.get_context("spawn") - p = ctx.Process(target=test_gridsearch_tuner) + p = ctx.Process(target=test_grid_search_tuner) p.start() p.join() @@ -63,20 +94,38 @@ def test_random_tuner(): measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15)) + assert tuner.begin_idx == 8 + assert tuner.end_idx == 16 + assert 
tuner.range_length == 8 + assert tuner.visited_max == 8 + + # Tuner should only focus on the specified range and should visit all indices + tuner.tune(n_trial=8, measure_option=measure_option) + assert len(tuner.visited) == 8 + assert not tuner.has_next() + for idx in tuner.visited: + assert 8 <= idx <= 15 + + # With multi-filter + task, _ = get_sample_task() + task.config_space.multi_filter( + filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15)) + assert tuner.begin_idx == 8 + assert tuner.end_idx == 16 assert tuner.range_length == 8 - assert tuner.index_offset == 8 + assert tuner.visited_max == 4 # Tuner should only focus on the specified range and should visit all indices tuner.tune(n_trial=8, measure_option=measure_option) - assert tuner.counter == 8 + assert len(tuner.visited) == 4 assert not tuner.has_next() - visited = set() for idx in tuner.visited: - assert idx not in visited assert 8 <= idx <= 15 if __name__ == "__main__": - test_gridsearch_tuner() - test_gridsearch_tuner_spawn() + test_grid_search_tuner() + test_grid_search_tuner_spawn() test_random_tuner() diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py index d9f2b528e429..eb783a9f8bcd 100644 --- a/tests/python/unittest/test_autotvm_space.py +++ b/tests/python/unittest/test_autotvm_space.py @@ -16,12 +16,11 @@ # under the License. 
"""Test space definition primitives""" -import tvm from tvm import te from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity -def gemm_func(cfg, N): +def gemm_func(cfg, N, filter_y=None, filter_x=None): A = te.placeholder((N, N), name="A") B = te.placeholder((N, N), name="B") @@ -32,8 +31,8 @@ def gemm_func(cfg, N): y, x = s[C].op.axis - cfg.define_split("tile_y", cfg.axis(y), num_outputs=2) - cfg.define_split("tile_x", cfg.axis(x), num_outputs=2) + cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=filter_y) + cfg.define_split("tile_x", cfg.axis(x), num_outputs=2, filter=filter_x) return s, [A, B, C] @@ -42,7 +41,7 @@ def test_split(): cfg = ConfigSpace() gemm_func(cfg, 128) - assert len(cfg) == 64 + assert cfg.range_length == 64 assert len(cfg.space_map["tile_y"]) == 8 # test policy @@ -102,5 +101,163 @@ def count4(n): pass +def _raises_exception(f): + try: + f() + except Exception: + return True + return False + + +def test_multi_filter(): + # create config without multi_filter + cfg = ConfigSpace() + gemm_func(cfg, 128) + # create config with multi_filter + cfg_mf = ConfigSpace() + gemm_func(cfg_mf, 128) + cfg_mf.multi_filter( + filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + # test len + assert len(cfg) == 64 + assert len(cfg_mf) == 34 + # test range_length + assert cfg.range_length == 64 + assert cfg_mf.range_length == 64 + # test dims + assert cfg.dims == [8, 8] + assert cfg_mf.dims == [8, 8] + # test is_index_valid + assert cfg.is_index_valid(0) is True + assert cfg.is_index_valid(15) is True + assert cfg_mf.is_index_valid(0) is False + assert cfg_mf.is_index_valid(15) is True + # test get + assert _raises_exception(lambda: cfg.get(0)) is False + assert _raises_exception(lambda: cfg.get(15)) is False + assert _raises_exception(lambda: cfg_mf.get(0)) is True + assert _raises_exception(lambda: cfg_mf.get(15)) is False + # test subrange_length + assert cfg.subrange_length(0, 64) == 64 + 
assert cfg.subrange_length(0, 32) == 32 + assert cfg.subrange_length(16, 32) == 16 + assert cfg.subrange_length(16, 16) == 0 + assert _raises_exception(lambda: cfg.subrange_length(0, 128)) + assert _raises_exception(lambda: cfg.subrange_length(-64, 64)) + assert _raises_exception(lambda: cfg.subrange_length(64, 0)) + assert cfg_mf.subrange_length(0, 64) == 34 + assert cfg_mf.subrange_length(0, 32) == 17 + assert cfg_mf.subrange_length(16, 32) == 10 + assert cfg_mf.subrange_length(16, 16) == 0 + assert _raises_exception(lambda: cfg_mf.subrange_length(0, 128)) + assert _raises_exception(lambda: cfg_mf.subrange_length(-64, 64)) + assert _raises_exception(lambda: cfg_mf.subrange_length(64, 0)) + # test point2knob + assert cfg.point2knob(0) == [0, 0] + assert cfg.point2knob(4) == [4, 0] + assert cfg.point2knob(8) == [0, 1] + assert cfg.point2knob(12) == [4, 1] + assert cfg_mf.point2knob(0) == [0, 0] + assert cfg_mf.point2knob(4) == [4, 0] + assert cfg_mf.point2knob(8) == [0, 1] + assert cfg_mf.point2knob(12) == [4, 1] + # test knob2point + assert cfg.knob2point([0, 0]) == 0 + assert cfg.knob2point([4, 0]) == 4 + assert cfg.knob2point([0, 1]) == 8 + assert cfg.knob2point([4, 1]) == 12 + assert cfg_mf.knob2point([0, 0]) == 0 + assert cfg_mf.knob2point([4, 0]) == 4 + assert cfg_mf.knob2point([0, 1]) == 8 + assert cfg_mf.knob2point([4, 1]) == 12 + # get_rand_index + cfg_valid_indexes = list(filter(lambda idx: cfg.is_index_valid(idx), range(cfg.range_length))) + assert cfg.get_rand_index() in cfg_valid_indexes + assert cfg.get_rand_index(start=15, end=16) == 15 + assert 10 <= cfg.get_rand_index(start=10, end=20) < 20 + assert cfg.get_rand_index(to_exclude=cfg_valid_indexes[:-1]) == cfg_valid_indexes[-1:][0] + cfg_mf_valid_indexes = list( + filter(lambda idx: cfg_mf.is_index_valid(idx), range(cfg_mf.range_length)) + ) + assert cfg_mf.get_rand_index() in cfg_mf_valid_indexes + assert cfg_mf.get_rand_index(start=15, end=16) == 15 + assert 10 <= cfg_mf.get_rand_index(start=10, 
end=20) < 20 + assert ( + cfg_mf.get_rand_index(to_exclude=cfg_mf_valid_indexes[:-1]) == cfg_mf_valid_indexes[-1:][0] + ) + # get_next_index + assert cfg.get_next_index(0) == 1 + assert cfg.get_next_index(0, 1) == 1 + assert cfg.get_next_index(0, 2) == 2 + assert cfg.get_next_index(0, -1) is None + assert cfg.get_next_index(0, -2) is None + assert cfg.get_next_index(63) is None + assert cfg.get_next_index(63, 1) is None + assert cfg.get_next_index(63, 2) is None + assert cfg.get_next_index(63, -1) == 62 + assert cfg.get_next_index(63, -2) == 61 + assert cfg.get_next_index(60, 1, end=63) == 61 + assert cfg.get_next_index(63, -1, start=60) == 62 + assert cfg_mf.get_next_index(0) == 5 + assert cfg_mf.get_next_index(0, 1) == 5 + assert cfg_mf.get_next_index(0, 2) == 6 + assert cfg_mf.get_next_index(0, -1) is None + assert cfg_mf.get_next_index(0, -2) is None + assert cfg_mf.get_next_index(63) is None + assert cfg_mf.get_next_index(63, 1) is None + assert cfg_mf.get_next_index(63, 2) is None + assert cfg_mf.get_next_index(63, -1) == 58 + assert cfg_mf.get_next_index(63, -2) == 57 + assert cfg_mf.get_next_index(60, 1, end=63) is None + assert cfg_mf.get_next_index(63, -1, start=60) is None + # test sample_ints + cfg_ints = cfg.sample_ints(5) + assert len(cfg_ints) == 5 + assert set(cfg_ints).issubset(cfg_valid_indexes) + cfg_mf_ints = cfg_mf.sample_ints(5) + assert len(cfg_mf_ints) == 5 + assert set(cfg_mf_ints).issubset(cfg_mf_valid_indexes) + # test random_walk + cfg_walk = cfg.random_walk(15) + assert cfg_walk != 15 + assert cfg_walk in cfg_valid_indexes + cfg_mf_walk = cfg_mf.random_walk(15) + assert cfg_mf_walk != 15 + assert cfg_mf_walk in cfg_mf_valid_indexes + + +def test_filter_and_multi_filter(): + # test the order: filter -> multi_filter + cfg = ConfigSpace() + gemm_func(cfg, 128, filter_y=lambda y: y.size[-1] < 64) + # after adding filter + assert len(cfg) == 48 + assert cfg.range_length == 48 + cfg.multi_filter( + filter=lambda entity: 32 <= 
(entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + # after adding multi_filter + assert len(cfg) == 27 + assert cfg.range_length == 48 + + # test the order: multi_filter -> filter + cfg = ConfigSpace() + s, (A, B, C) = gemm_func(cfg, 128, filter_y=None) + cfg.multi_filter( + filter=lambda entity: 32 <= (entity["tile_x"].size[1] * entity["tile_y"].size[1]) < 1024 + ) + # after adding multi_filter + assert len(cfg) == 34 + assert cfg.range_length == 64 + y, x = s[C].op.axis + cfg.define_split("tile_y", cfg.axis(y), num_outputs=2, filter=lambda y: y.size[-1] < 64) + # after adding filter + assert len(cfg) == 27 + assert cfg.range_length == 48 + + if __name__ == "__main__": test_split() + test_multi_filter() + test_filter_and_multi_filter() From 195ae72b5c6f0df68fac41f7808d125d155a6345 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 23 Sep 2022 03:26:05 +0900 Subject: [PATCH 228/704] [TOPI] Fix dtype legalize logic for CPU dot product instruction (#12865) The logic in `python/tvm/topi/generic/conv2d.py#L480-L499` is supposed to legalize the input dtype to be able to apply target-specific intrinsics that only support one of int8 or uint8. For example, the x86 VNNI instruction only supports uint8 activation. But the logic is incorrect (two cases are flipped) and leads to incorrect result in the following case: * The input activation is int8, and we want to use the x86 VNNI intrinsic which only supports uint8 activations. * The input activation is uint8, and we want to use the ARM `sdot` intrinsic which only supports int8 activations. The first case also applies to the Hexagon `vrmpy` intrinsic. I found this bug while testing `vrmpy` conv2d on int8 input. To test this on CI, we need to be running on a cascadelake or ARM v8.2 (with dot product support) instance. I cannot find a way to detect such cpu feature from a python script. 
`try / catch` doesn't work because the error is raised from LLVM (`LLVM ERROR: Do not know how to split the result of this operator`) that I don't know how to catch. So for now the test is skipped. --- python/tvm/topi/generic/conv2d.py | 15 +++--- tests/python/relay/test_op_level2.py | 75 ++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 1cb69d593d1a..48b2a2f97146 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -477,7 +477,7 @@ def conv2d_alter_int8_common( pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw)) if data_tensor.dtype != data_dtype: - # How to convert data to int8 + # How to convert data to uint8 # Original --> C = A (conv) B # A and B are int8 # C = (A + 128 - 128) (conv) B @@ -485,18 +485,20 @@ def conv2d_alter_int8_common( # where A' = A + 128 # and 128 (conv) B is basically a reduce on CRS axis for weights. # - # How to convert data to uint8 + # How to convert data to int8 # C = (A - 128 + 128) (conv) B # C = (A' conv B) + 128 (conv) B # where A' = A - 128 - if data_dtype == "int8": - # shift data to int8 + if data_dtype == "uint8": + # shift data to uint8 before_shift = relay.add after_shift = relay.subtract + pad_value = 128 else: - # shift data to uint8 + # shift data to int8 before_shift = relay.subtract after_shift = relay.add + pad_value = -128 if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO": adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(0, 1, 2)) @@ -514,7 +516,8 @@ def conv2d_alter_int8_common( # Do external padding as pad value has to be 128. if any(padding): - data = relay.nn.pad(data, pad_width=pad_width, pad_value=128) + data = relay.nn.pad(data, pad_width=pad_width, pad_value=pad_value) + new_attrs["padding"] = (0, 0) # Multiply 128 to adjust shift. 
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 84b72e4cffd2..6a895aaf0518 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2137,5 +2137,80 @@ def get_subgraph(dtype): np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) +@pytest.mark.skip("Requires cascadelake or ARM v8.2") +def test_conv2d_int8_alter_dtype(): + def get_conv2d_nchw( + d_shape, + w_shape, + data_dtype, + ): + out_dtype = "int32" + strides = (1, 1) + padding = (1, 1) + data = relay.var("data", shape=d_shape, dtype=data_dtype) + weight = relay.var("weight", shape=w_shape, dtype="int8") + out_channel = w_shape[0] + return relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=w_shape[2:], + channels=out_channel, + padding=padding, + strides=strides, + out_dtype=out_dtype, + ) + + I, O, H, W = 64, 64, 56, 56 + kH = kW = 3 + + data_shape = (1, I, H, W) + weight_shape = (O, I, kH, kW) + bias_shape = (1, weight_shape[0], 1, 1) + + bias = relay.var("bias", shape=bias_shape, dtype="int32") + bias_np = np.random.randint(low=-127, high=128, size=bias_shape).astype("int32") + weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8") + + for data_dtype, target, dot_product_instr in [ + ("uint8", "llvm --device arm_cpu -mattr=+v8.2a,+dotprod", "sdot"), + ("int8", "llvm -mcpu=cascadelake", "vpdpbusd"), + ]: + conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype) + bias_add = relay.add(conv2d, bias) + mod = tvm.IRModule.from_expr(bias_add) + + if data_dtype == "uint8": + data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8") + else: + data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8") + + params = {"weight": weight_np, "bias": bias_np} + + ref = ( + relay.create_executor("graph", mod=mod, device=tvm.cpu(0), target="llvm") + .evaluate()(*[data_np, weight_np, bias_np]) + .numpy() + ) + + dev = tvm.cpu(0) + + with tvm.transform.PassContext( + 
opt_level=3, + ): + lib = relay.build(mod, target=target, params=params) + + assert dot_product_instr in lib.lib.get_source("asm") + + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + rt_mod.set_input("data", data_np) + + rt_mod.run() + + out = rt_mod.get_output(0).numpy() + + np.testing.assert_equal(out, ref) + + if __name__ == "__main__": tvm.testing.main() From 86f9580498e7d4b5c826e7ae55b05f2a4e35a95c Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Sep 2022 13:19:14 -0700 Subject: [PATCH 229/704] [Relay] Fix handling of TransfromLayout in TE compiler cache (#12874) --- src/relay/backend/te_compiler_cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index a8eb6a58105f..17eac443ffe3 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -374,7 +374,7 @@ class ScheduleBuilder : public ExprVisitor { TuningRecord record = opt_record.value(); for (const Instruction& inst : record->trace->insts) { if (inst->kind.same_as(kind_transform_layout)) { - ICHECK_EQ(inst->attrs.size(), 3); + ICHECK_EQ(inst->attrs.size(), 4); MetaScheduleLayoutRewriter::LayoutQueuePush(Downcast(inst->attrs[2])); } } From 4e783a6087fd236c588cde30e0ac99daa15afe61 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Sep 2022 13:20:40 -0700 Subject: [PATCH 230/704] [TOPI] Add layer norm operator (#12864) * [TOPI] Add one-pass layer norm using tuple reduction * Add reducer pattern for LowerCrossThreadReduction * lint * update docs --- include/tvm/topi/nn/layer_norm.h | 117 ++++++++++++++ include/tvm/topi/reduction.h | 23 +++ python/tvm/topi/nn/__init__.py | 1 + python/tvm/topi/nn/layer_norm.py | 46 ++++++ python/tvm/topi/testing/__init__.py | 1 + python/tvm/topi/testing/layer_norm_python.py | 53 +++++++ src/tir/schedule/primitive/reduction.cc | 9 ++ src/topi/nn.cc | 6 + .../topi/python/test_topi_layer_norm.py | 62 
++++++++ ..._transform_lower_cross_thread_reduction.py | 149 ++++++++++++++++++ 10 files changed, 467 insertions(+) create mode 100644 include/tvm/topi/nn/layer_norm.h create mode 100644 python/tvm/topi/nn/layer_norm.py create mode 100644 python/tvm/topi/testing/layer_norm_python.py create mode 100644 tests/python/topi/python/test_topi_layer_norm.py diff --git a/include/tvm/topi/nn/layer_norm.h b/include/tvm/topi/nn/layer_norm.h new file mode 100644 index 000000000000..93e5582ef184 --- /dev/null +++ b/include/tvm/topi/nn/layer_norm.h @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \brief layer normalization op constructions + * \file nn/layer_norm.h + */ +#ifndef TVM_TOPI_NN_LAYER_NORM_H_ +#define TVM_TOPI_NN_LAYER_NORM_H_ + +#include +#include + +#include + +namespace tvm { +namespace topi { +namespace nn { + +using namespace tvm::te; + +/*! + * \brief Layer normalization. 
+ * \param data N-D tensor with shape [d_0, d_1, ..., d_{N-1}] + * \param gamma K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where K == len(axis) and + * d_{axis_k} == r_k + * \param beta Optional, K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where + * d_{axis_k} == r_k + * \param axis The axis to normalize over. + * \param epsilon The epsilon value to avoid division by zero. + * \param name The name of the operation. + * \param tag The tag to mark the operation. + * \return The normalized tensor, with the same shape as data. + */ +inline Tensor layer_norm(const Tensor& data, const Tensor& gamma, const Tensor& beta, + const Array& axis, double epsilon, + std::string name = "T_layer_norm", std::string tag = kInjective) { + // sum x and x^2 + auto ndim = data->shape.size(); + ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor"; + auto real_axis = GetRealAxis(static_cast(ndim), axis); + auto reduce_axes = MakeReduceAxes(real_axis, data); + auto target_shape = + MakeReduceTargetShape(real_axis, data, /*keepdims=*/false, /*atleast1d=*/true); + auto func = MakeTupleSumReducer(); + + auto compute = [ndim, &real_axis, &reduce_axes, &func, &data](const Array& indices) { + Array eval_range; + int arg_counter = 0; + int red_counter = 0; + + for (size_t i = 0; i < ndim; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) { + // real_axis contains i + eval_range.push_back(reduce_axes[red_counter]); + red_counter++; + } else { + eval_range.push_back(indices[arg_counter]); + arg_counter++; + } + } + auto square = [](const PrimExpr& x) { return x * x; }; + return func({data(eval_range), square(data(eval_range))}, reduce_axes, nullptr); + }; + + auto temp_x_x2 = + tvm::te::compute(target_shape, compute, data->op->name + "_red_temp", kCommReduce); + + auto temp_x = temp_x_x2[0]; + auto temp_x2 = temp_x_x2[1]; + + auto reduce_extent = make_const(data->dtype, 1); + for (int i : real_axis) { + reduce_extent *= data->shape[i]; + } + auto 
layer_norm_func = [&](const Array& indices) { + Array reduce_indices, non_reduce_indices; + for (int i = 0, n = static_cast(indices.size()); i < n; ++i) { + if (std::find(real_axis.begin(), real_axis.end(), i) != real_axis.end()) { + reduce_indices.push_back(indices[i]); + } else { + non_reduce_indices.push_back(indices[i]); + } + } + auto mean = temp_x(non_reduce_indices) / reduce_extent; + auto var = temp_x2(non_reduce_indices) / reduce_extent - mean * mean; + auto layer_norm = (data(indices) - mean) * tvm::rsqrt(var + make_const(var->dtype, epsilon)); + layer_norm = topi::multiply(layer_norm, gamma(reduce_indices)); + if (beta.defined()) { + layer_norm = topi::add(layer_norm, beta(reduce_indices)); + } + return layer_norm; + }; + return tvm::te::compute(data->shape, layer_norm_func, name, tag); +} + +} // namespace nn +} // namespace topi +} // namespace tvm + +#endif // TVM_TOPI_NN_LAYER_NORM_H_ diff --git a/include/tvm/topi/reduction.h b/include/tvm/topi/reduction.h index d4e420d80b02..5e79bd429d6f 100644 --- a/include/tvm/topi/reduction.h +++ b/include/tvm/topi/reduction.h @@ -570,6 +570,29 @@ inline Tensor prod(const Tensor& data, const Array& axis, bool keepdims return CommReduce(data, axis, ProdOp, keepdims, atleast1d); } +/*! 
+ * \brief Create communitive reducer summing over tuples + */ +inline FCommReduce MakeTupleSumReducer() { + auto fcombine = [](Array lhs, Array rhs) { + Array result; + ICHECK_EQ(lhs.size(), rhs.size()); + result.reserve(lhs.size()); + for (size_t i = 0; i < lhs.size(); ++i) { + result.push_back(lhs[i] + rhs[i]); + } + return result; + }; + auto fidentity = [](std::vector types) { + Array result; + for (size_t i = 0; i < types.size(); ++i) { + result.push_back(tvm::tir::make_const(types[i], 0)); + } + return result; + }; + return MakeCommReducer(fcombine, fidentity, "tuple_sum"); +} + } // namespace topi } // namespace tvm #endif // TVM_TOPI_REDUCTION_H_ diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index 1dd922d76819..8f081242fa10 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -38,6 +38,7 @@ from .bnn import * from .qnn import * from .upsampling import * +from .layer_norm import layer_norm from .local_response_norm import * from .bitserial_conv2d import * from .bitserial_dense import * diff --git a/python/tvm/topi/nn/layer_norm.py b/python/tvm/topi/nn/layer_norm.py new file mode 100644 index 000000000000..3bdeaaac61a5 --- /dev/null +++ b/python/tvm/topi/nn/layer_norm.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Layer normalization operator.""" +from .. import cpp + + +def layer_norm(data, gamma, beta, axis, epsilon=1e-5): + """Layer normalization operator. + + Parameters + ---------- + data : tvm.te.Tensor + N-D with shape (d_0, d_1, ..., d_{N-1}) + + gamma: tvm.te.Tensor + K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k + + beta: tvm.te.Tensor + Optional, K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k + + axis : list of int + Axis over the normalization applied + + epsilon : float + The epsilon value to avoid division by zero. + + Returns + ------- + result : tvm.te.Tensor + N-D with shape (d_0, d_1, ..., d_{N-1}) + """ + return cpp.nn.layer_norm(data, gamma, beta, axis, epsilon) diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index 2f091cba10b7..2922c30b505c 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -43,6 +43,7 @@ from .reorg_python import reorg_python from .roi_align_python import roi_align_nchw_python, roi_align_nhwc_python from .roi_pool_python import roi_pool_nchw_python +from .layer_norm_python import layer_norm_python from .lrn_python import lrn_python from .l2_normalize_python import l2_normalize_python from .gather_python import gather_python diff --git a/python/tvm/topi/testing/layer_norm_python.py b/python/tvm/topi/testing/layer_norm_python.py new file mode 100644 index 000000000000..6b3b00146983 --- /dev/null +++ b/python/tvm/topi/testing/layer_norm_python.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals +"""Layer normalization in python""" +import numpy as np + + +def layer_norm_python(data, gamma, beta, axis, epsilon=1e-5): + """Layer normalization operator in Python. + + Parameters + ---------- + data : numpy.ndarray + N-D with shape (d_0, d_1, ..., d_{N-1}) + + gamma: numpy.ndarray + K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k + + beta: numpy.ndarray + Optional, K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and d_{axis_k} == r_k + + axis : int or tuple of ints + Axis over the normalization applied + + epsilon : float + The epsilon value to avoid division by zero. 
+ + Returns + ------- + result : np.ndarray + N-D with shape (d_0, d_1, ..., d_{N-1}) + """ + mean = np.mean(data, axis, keepdims=True) + var = np.var(data, axis, keepdims=True) + result = (data - mean) / np.sqrt(var + epsilon) + result *= gamma + if beta is not None: + result += beta + return result diff --git a/src/tir/schedule/primitive/reduction.cc b/src/tir/schedule/primitive/reduction.cc index dd2bcf727c40..bb43df1ce914 100644 --- a/src/tir/schedule/primitive/reduction.cc +++ b/src/tir/schedule/primitive/reduction.cc @@ -330,6 +330,15 @@ struct ReducerRegistry { [](const Array& values) { return Array{min_value(values[0]->dtype)}; }), + CreateReducerGetter( + /*n_buffers=*/2, + [](const Array& x, const Array& y) { + return Array{x[0] + y[0], x[1] + y[1]}; + }, + [](const Array& values) { + return Array{make_const(values[0]->dtype, 0), + make_const(values[1]->dtype, 0)}; + }), CreateReducerGetter( /*n_buffers=*/2, [](const Array& x, const Array& y) { diff --git a/src/topi/nn.cc b/src/topi/nn.cc index 2950aee4e90d..35dbf3a03e4f 100644 --- a/src/topi/nn.cc +++ b/src/topi/nn.cc @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -157,5 +158,10 @@ TVM_REGISTER_GLOBAL("topi.nn.binary_dense").set_body([](TVMArgs args, TVMRetValu *rv = nn::binary_dense(args[0], args[1]); }); +/* Ops from nn/layer_norm.h */ +TVM_REGISTER_GLOBAL("topi.nn.layer_norm").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = nn::layer_norm(args[0], args[1], args[2], args[3], static_cast(args[4])); +}); + } // namespace topi } // namespace tvm diff --git a/tests/python/topi/python/test_topi_layer_norm.py b/tests/python/topi/python/test_topi_layer_norm.py new file mode 100644 index 000000000000..ead05470be3b --- /dev/null +++ b/tests/python/topi/python/test_topi_layer_norm.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for layer_norm.""" +import numpy as np +import pytest +import tvm +from tvm import te +from tvm import topi +from tvm.topi.utils import get_const_tuple +import tvm.topi.testing + +import tvm.testing + + +_layer_norm_schedule = { + "generic": topi.generic.schedule_injective, +} + + +# only test on llvm because schedule is missing +@tvm.testing.parametrize_targets("llvm") +@pytest.mark.parametrize("shape,axis", [([4, 16], (1,)), ([4, 16, 16], (1, 2))]) +def test_layer_norm(target, dev, shape, axis, episilon=1e-5, dtype="float32", rtol=1e-5, atol=1e-5): + data = te.placeholder(shape, dtype=dtype, name="data") + scale_shape = [shape[dim] for dim in axis] + gamma = te.placeholder(scale_shape, dtype=dtype, name="gamma") + beta = te.placeholder(scale_shape, dtype=dtype, name="beta") + B = topi.nn.layer_norm(data, gamma, beta, axis, episilon) + + data_np = np.random.uniform(size=shape).astype(dtype) + gamma_np = np.random.uniform(size=scale_shape).astype(dtype) + beta_np = np.random.uniform(size=scale_shape).astype(dtype) + b_np = tvm.topi.testing.layer_norm_python(data_np, gamma_np, beta_np, axis, episilon) + + with tvm.target.Target(target): + s_func = tvm.topi.testing.dispatch(target, _layer_norm_schedule) + s = s_func([B]) + data_tvm = tvm.nd.array(data_np, dev) + gamma_tvm = 
tvm.nd.array(gamma_np, dev) + beta_tvm = tvm.nd.array(beta_np, dev) + b_tvm = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev) + f = tvm.build(s, [data, gamma, beta, B], target) + f(data_tvm, gamma_tvm, beta_tvm, b_tvm) + tvm.testing.assert_allclose(b_tvm.asnumpy(), b_np, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py index ff1353d2265e..8c139b710e23 100644 --- a/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py +++ b/tests/python/unittest/test_tir_transform_lower_cross_thread_reduction.py @@ -1002,6 +1002,151 @@ def lowered_argmin_split_init_update_reordered( argmin_v1[i] = cross_thread_argmin_v1[0] +@T.prim_func +def layer_norm_tuple_sum( + data: T.Buffer[(128, 768), "float32"], + gamma: T.Buffer[768, "float32"], + bias: T.Buffer[768, "float32"], + T_layer_norm: T.Buffer[(128, 768), "float32"], +) -> None: + data_red_temp_v0 = T.alloc_buffer([128], dtype="float32") + data_red_temp_v1 = T.alloc_buffer([128], dtype="float32") + for i0_fused in T.thread_binding(128, thread="blockIdx.x"): + for i1_0 in T.serial(24): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("data_red_temp"): + ax0 = T.axis.spatial(128, i0_fused) + k1 = T.axis.reduce(768, i1_0 * 32 + i1_1) + T.reads(data[ax0, k1]) + T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0]) + with T.init(): + data_red_temp_v0[ax0] = T.float32(0) + data_red_temp_v1[ax0] = T.float32(0) + v_data_red_temp_v0: T.float32 = data_red_temp_v0[ax0] + data[ax0, k1] + v_data_red_temp_v1: T.float32 = ( + data_red_temp_v1[ax0] + data[ax0, k1] * data[ax0, k1] + ) + data_red_temp_v0[ax0] = v_data_red_temp_v0 + data_red_temp_v1[ax0] = v_data_red_temp_v1 + for i0_i1_fused_0 in T.thread_binding(384, thread="blockIdx.x"): + for i0_i1_fused_1 in T.thread_binding(256, 
thread="threadIdx.x"): + with T.block("T_layer_norm"): + ax0 = T.axis.spatial(128, (i0_i1_fused_0 * 256 + i0_i1_fused_1) // 768) + ax1 = T.axis.spatial(768, (i0_i1_fused_0 * 256 + i0_i1_fused_1) % 768) + T.reads( + data[ax0, ax1], + data_red_temp_v0[ax0], + data_red_temp_v1[ax0], + gamma[ax1], + bias[ax1], + ) + T.writes(T_layer_norm[ax0, ax1]) + T_layer_norm[ax0, ax1] = ( + data[ax0, ax1] - data_red_temp_v0[ax0] * T.float32(0.0013020833333333333) + ) * T.rsqrt( + data_red_temp_v1[ax0] * T.float32(0.0013020833333333333) + - data_red_temp_v0[ax0] + * T.float32(0.0013020833333333333) + * (data_red_temp_v0[ax0] * T.float32(0.0013020833333333333)) + + T.float32(1.0000000000000001e-05), + dtype="float32", + ) * gamma[ + ax1 + ] + bias[ + ax1 + ] + + +@T.prim_func +def lowered_layer_norm_tuple_sum( + data: T.Buffer[(128, 768), "float32"], + gamma: T.Buffer[768, "float32"], + bias: T.Buffer[768, "float32"], + T_layer_norm: T.Buffer[(128, 768), "float32"], +) -> None: + # with T.block("root") + data_red_temp_v0 = T.alloc_buffer([128], dtype="float32") + data_red_temp_v1 = T.alloc_buffer([128], dtype="float32") + cross_thread_data_red_temp_v0 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + cross_thread_data_red_temp_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + in_thread_data_red_temp_v0 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + in_thread_data_red_temp_v1 = T.alloc_buffer([1], dtype="float32", strides=[1], scope="local") + for i0_fused in T.thread_binding(128, thread="blockIdx.x"): + for i1_1 in T.thread_binding(32, thread="threadIdx.x"): + with T.block("data_red_temp_in_thread_init"): + T.reads() + T.writes(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0]) + in_thread_data_red_temp_v0[0] = T.float32(0) + in_thread_data_red_temp_v1[0] = T.float32(0) + for i1_0 in T.serial(24): + with T.block("data_red_temp_in_thread"): + ax0 = T.axis.spatial(128, i0_fused) + k1 = T.axis.reduce(768, i1_0 * 
32 + i1_1) + T.reads(data[ax0, k1]) + T.writes(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0]) + v_data_red_temp_v0: T.float32 = in_thread_data_red_temp_v0[0] + data[ax0, k1] + v_data_red_temp_v1: T.float32 = ( + in_thread_data_red_temp_v1[0] + data[ax0, k1] * data[ax0, k1] + ) + in_thread_data_red_temp_v0[0] = v_data_red_temp_v0 + in_thread_data_red_temp_v1[0] = v_data_red_temp_v1 + with T.block("data_red_temp_cross_thread"): + T.reads(in_thread_data_red_temp_v0[0], in_thread_data_red_temp_v1[0]) + T.writes(cross_thread_data_red_temp_v0[0], cross_thread_data_red_temp_v1[0]) + T.attr( + T.comm_reducer( + lambda x0, x1, y0, y1: (x0 + y0, x1 + y1), [T.float32(0), T.float32(0)] + ), + "reduce_scope", + T.reinterpret(T.uint64(0), dtype="handle"), + ) + T.evaluate( + T.tvm_thread_allreduce( + T.uint32(2), + in_thread_data_red_temp_v0[0], + in_thread_data_red_temp_v1[0], + True, + cross_thread_data_red_temp_v0[0], + cross_thread_data_red_temp_v1[0], + i1_1, + dtype="handle", + ) + ) + with T.block("data_red_temp_write_back"): + ax0 = T.axis.spatial(128, i0_fused) + T.reads(cross_thread_data_red_temp_v0[0], cross_thread_data_red_temp_v1[0]) + T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0]) + data_red_temp_v0[ax0] = cross_thread_data_red_temp_v0[0] + data_red_temp_v1[ax0] = cross_thread_data_red_temp_v1[0] + for i0_i1_fused_0 in T.thread_binding(384, thread="blockIdx.x"): + for i0_i1_fused_1 in T.thread_binding(256, thread="threadIdx.x"): + with T.block("T_layer_norm"): + ax0 = T.axis.spatial(128, (i0_i1_fused_0 * 256 + i0_i1_fused_1) // 768) + ax1 = T.axis.spatial(768, (i0_i1_fused_0 * 256 + i0_i1_fused_1) % 768) + T.reads( + data[ax0, ax1], + data_red_temp_v0[ax0], + data_red_temp_v1[ax0], + gamma[ax1], + bias[ax1], + ) + T.writes(T_layer_norm[ax0, ax1]) + T_layer_norm[ax0, ax1] = ( + data[ax0, ax1] - data_red_temp_v0[ax0] * T.float32(0.0013020833333333333) + ) * T.rsqrt( + data_red_temp_v1[ax0] * T.float32(0.0013020833333333333) + - 
data_red_temp_v0[ax0] + * T.float32(0.0013020833333333333) + * (data_red_temp_v0[ax0] * T.float32(0.0013020833333333333)) + + T.float32(1.0000000000000001e-05), + dtype="float32", + ) * gamma[ + ax1 + ] + bias[ + ax1 + ] + + # pylint: enable=no-member,invalid-name,unused-variable,unexpected-keyword-arg @@ -1087,5 +1232,9 @@ def test_lower_te(): ) # LowerCrossThreadReduction should do nothing on TE +def test_layer_norm_tuple_sum(): + _check(layer_norm_tuple_sum, lowered_layer_norm_tuple_sum) + + if __name__ == "__main__": tvm.testing.main() From 0b074d8f06dc411bcf779e8b59645626228bf5ce Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Sep 2022 14:38:41 -0700 Subject: [PATCH 231/704] Fix clang warnings (#12876) --- include/tvm/tir/stmt_functor.h | 2 +- src/relay/transforms/annotate_texture_storage.cc | 4 ++-- src/script/printer/doc.cc | 2 +- src/tir/schedule/primitive/pad_einsum.cc | 5 +++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h index 2fc3b9678b40..17530380e665 100644 --- a/include/tvm/tir/stmt_functor.h +++ b/include/tvm/tir/stmt_functor.h @@ -442,7 +442,7 @@ template virtual_device_ = VirtualDevice(virtual_device->device_type(), virtual_device->virtual_device_id, virtual_device->target, storage_scope_[GetRef(vn)][Expr()][0]); - return c; + return std::move(c); } return GetRef(vn); } @@ -520,7 +520,7 @@ class RewriteVDStorageScopes : public transform::DeviceAwareExprMutator { virtual_device->target, memory_scope), true); } - return new_call; + return std::move(new_call); } private: diff --git a/src/script/printer/doc.cc b/src/script/printer/doc.cc index f3b431bd62db..1ca7ced8e8a7 100644 --- a/src/script/printer/doc.cc +++ b/src/script/printer/doc.cc @@ -30,7 +30,7 @@ ExprDoc ExprDocNode::Attr(String attr) const { return AttrAccessDoc(GetRef attr) const { auto doc = AttrAccessDoc(GetRef(this), attr.Get()); doc->source_paths.push_back(attr.GetPath()); - return doc; + return 
std::move(doc); } ExprDoc ExprDocNode::operator[](Array indices) const { diff --git a/src/tir/schedule/primitive/pad_einsum.cc b/src/tir/schedule/primitive/pad_einsum.cc index 7a7b88d686f9..2190dc69d33d 100644 --- a/src/tir/schedule/primitive/pad_einsum.cc +++ b/src/tir/schedule/primitive/pad_einsum.cc @@ -227,6 +227,8 @@ class PadEinsumRewriter : public ReplaceBufferMutator { producer_predicate_(producer_predicate), padded_iter_extents_(padded_iter_extents), analyzer_(analyzer) {} + using ReplaceBufferMutator::VisitExpr_; + using ReplaceBufferMutator::VisitStmt_; Stmt VisitStmt_(const ForNode* op) final { For new_for = Downcast(ReplaceBufferMutator::VisitStmt_(op)); @@ -371,8 +373,7 @@ void PadEinsum(ScheduleState self, const StmtSRef& block_sref, const Array buffer_remap; // mapping from buffers to new buffers with padded shapes // Utility function to pad a buffer with the new shape - auto f_pad_buffer = [&padded_iter_extents, &buffer_remap](Buffer buffer, - const Array& indices) -> Buffer { + auto f_pad_buffer = [&padded_iter_extents](Buffer buffer, const Array& indices) -> Buffer { Array new_shape; for (const Var& index : indices) { new_shape.push_back(padded_iter_extents.at(index)); From ce8ac3e78454462025c03197767af764387df4ff Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Sep 2022 20:43:10 -0700 Subject: [PATCH 232/704] [TIR] Allow missing TensorIntrin in registry lookup (#12875) Added an option to allow missing tensor intrin. 
--- include/tvm/tir/function.h | 7 +++++-- python/tvm/tir/function.py | 12 ++++++++---- .../schedule_rule/multi_level_tiling_tensor_core.cc | 2 +- .../schedule_rule/multi_level_tiling_with_intrin.cc | 2 +- src/tir/ir/function.cc | 12 +++++++++--- src/tir/schedule/concrete_schedule.cc | 4 ++-- src/tir/schedule/transform.cc | 2 +- tests/python/unittest/test_tir_schedule_tensorize.py | 7 +++++++ 8 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/tvm/tir/function.h b/include/tvm/tir/function.h index bb6287c69b16..d793d84fc677 100644 --- a/include/tvm/tir/function.h +++ b/include/tvm/tir/function.h @@ -234,10 +234,13 @@ class TensorIntrin : public ObjectRef { /*! * \brief Look up TensorIntrin by name. Raises an exception if not found. * \param name The name of the TensorIntrin. + * \param allow_missing Whether to allow missing tensor intrin. If false, an exception is raised + * if the tensor intrin is not found. * \return The TensorIntrin with the specified name. - * \throws This method throws an exception if the TensorIntrin does not exist. + * \throws This method throws an exception if the TensorIntrin does not exist and allow_missing is + * false. */ - TVM_DLL static TensorIntrin Get(String name); + TVM_DLL static Optional Get(String name, bool allow_missing = false); TVM_DEFINE_OBJECT_REF_METHODS(TensorIntrin, ObjectRef, TensorIntrinNode) }; diff --git a/python/tvm/tir/function.py b/python/tvm/tir/function.py index df39f8aebf71..dd684bc4f1ae 100644 --- a/python/tvm/tir/function.py +++ b/python/tvm/tir/function.py @@ -245,7 +245,7 @@ def register(name: str, desc: PrimFunc, impl: PrimFunc, override: bool = False): ) # type: ignore @staticmethod - def get(name: str): + def get(name: str, allow_missing: bool = False) -> Optional["TensorIntrin"]: """Look up a tensor intrinsic by its name. Parameters @@ -253,12 +253,16 @@ def get(name: str): name : str The name of the TensorIntrin to look up. 
+ allow_missing : bool + Whether to allow missing tensor intrin. If False, raise an error if the tensor intrin + doesn't exist. + Returns ------- - result : TensorIntrin - The TensorIntrin with the specified name. + result : Optional[TensorIntrin] + The TensorIntrin with the specified name, or None if not found. """ - return _ffi_api.TensorIntrinGet(name) # pylint: type: ignore + return _ffi_api.TensorIntrinGet(name, allow_missing) # pylint: type: ignore @tvm._ffi.register_object("tir.IndexMap") diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc index 290a85b2579b..fbf9aa19b711 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_tensor_core.cc @@ -184,7 +184,7 @@ Array MultiLevelTilingTensorCoreNode::Apply(const Schedule& sch, TensorCoreIntrinGroup intrin_group = intrin_groups[i]; Optional mapping_info = tir::GetAutoTensorizeMappingInfo( sch->state(), sch->GetSRef(block_rv), - tir::TensorIntrin::Get(intrin_groups[i].compute_intrin)->desc); + tir::TensorIntrin::Get(intrin_groups[i].compute_intrin).value()->desc); if (mapping_info.defined()) { intrin_group_to_mapping_info.emplace(i, mapping_info.value()); } diff --git a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc index b953d1ad4b50..8485e697eb24 100644 --- a/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc +++ b/src/meta_schedule/schedule_rule/multi_level_tiling_with_intrin.cc @@ -47,7 +47,7 @@ Optional TileForIntrin(tir::Schedule sch, tir::BlockRV block, class MultiLevelTilingWithIntrinNode : public MultiLevelTilingNode { protected: Array Apply(const tir::Schedule& sch, const tir::BlockRV& block_rv) final { - auto desc_func = tir::TensorIntrin::Get(intrin_name)->desc; + auto desc_func = 
tir::TensorIntrin::Get(intrin_name).value()->desc; if (!CheckAutoTensorizeApplicable(sch, block_rv, desc_func)) { TVM_PY_LOG(INFO, logging_func) << "The workload cannot be tensorized."; return {sch}; diff --git a/src/tir/ir/function.cc b/src/tir/ir/function.cc index 028ff1220785..c609ad158e34 100644 --- a/src/tir/ir/function.cc +++ b/src/tir/ir/function.cc @@ -97,11 +97,17 @@ void TensorIntrin::Register(String name, TensorIntrin intrin, bool override) { manager->reg.Set(name, intrin); } -TensorIntrin TensorIntrin::Get(String name) { +Optional TensorIntrin::Get(String name, bool allow_missing) { const TensorIntrinManager* manager = TensorIntrinManager::Global(); auto it = manager->reg.find(name); - CHECK(it != manager->reg.end()) << "ValueError: TensorIntrin '" << name << "' is not registered"; - return manager->reg.at(name); + if (it == manager->reg.end()) { + if (allow_missing) { + return NullOpt; + } else { + LOG(FATAL) << "ValueError: TensorIntrin '" << name << "' is not registered"; + } + } + return (*it).second; } TVM_REGISTER_NODE_TYPE(TensorIntrinNode); diff --git a/src/tir/schedule/concrete_schedule.cc b/src/tir/schedule/concrete_schedule.cc index 4558ad04baed..8cfbadf65012 100644 --- a/src/tir/schedule/concrete_schedule.cc +++ b/src/tir/schedule/concrete_schedule.cc @@ -675,14 +675,14 @@ BlockRV ConcreteScheduleNode::Blockize(const LoopRV& loop_rv) { void ConcreteScheduleNode::Tensorize(const LoopRV& loop_rv, const String& intrin) { TVM_TIR_SCHEDULE_BEGIN(); - tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin)); + tir::Tensorize(state_, this->GetSRef(loop_rv), tir::TensorIntrin::Get(intrin).value()); this->state_->DebugVerify(); TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_); } void ConcreteScheduleNode::Tensorize(const BlockRV& block_rv, const String& intrin) { TVM_TIR_SCHEDULE_BEGIN(); - tir::Tensorize(state_, this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin)); + tir::Tensorize(state_, 
this->GetSRef(block_rv), tir::TensorIntrin::Get(intrin).value()); this->state_->DebugVerify(); TVM_TIR_SCHEDULE_END("tensorize", this->error_render_level_); } diff --git a/src/tir/schedule/transform.cc b/src/tir/schedule/transform.cc index d99cc199fe5f..7a720fe3eae2 100644 --- a/src/tir/schedule/transform.cc +++ b/src/tir/schedule/transform.cc @@ -291,7 +291,7 @@ Optional TileWithTensorIntrin(const tir::Schedule& sch, const tir::Block const String& intrin_name, bool allow_padding) { Optional opt_tensorize_info = GetTensorizeLoopMapping(sch->state(), sch->GetSRef(block_rv), - tir::TensorIntrin::Get(intrin_name)->desc, allow_padding); + tir::TensorIntrin::Get(intrin_name).value()->desc, allow_padding); if (!opt_tensorize_info) return NullOpt; const tir::TensorizeInfoNode* info = opt_tensorize_info.value().get(); if (info->block_iter_paddings.defined()) { diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py index 828dad2fc036..f04de8e0051f 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize.py +++ b/tests/python/unittest/test_tir_schedule_tensorize.py @@ -646,5 +646,12 @@ def fetch_to_shared(block, idx): verify_trace_roundtrip(sch=sch, mod=func) +def test_tensor_intrin_look_up(): + intrin_name = 'non_existent_intrin' + assert tir.TensorIntrin.get(intrin_name, allow_missing=True) is None + with pytest.raises(ValueError): + tir.TensorIntrin.get(intrin_name) + + if __name__ == "__main__": tvm.testing.main() From 9ce95a9abe3db43b4a4187111c9e2ad0d6bf3dbd Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 22 Sep 2022 20:43:22 -0700 Subject: [PATCH 233/704] [TIR] Fix wmma index in CUDA tensor intrins (#12879) --- python/tvm/tir/tensor_intrin/cuda.py | 77 ++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/cuda.py b/python/tvm/tir/tensor_intrin/cuda.py index a309b091285b..3374d18dff80 100644 --- 
a/python/tvm/tir/tensor_intrin/cuda.py +++ b/python/tvm/tir/tensor_intrin/cuda.py @@ -489,10 +489,13 @@ def mma_store_impl(a: T.handle, c: T.handle) -> None: ######## WMMA intrinsics ######## -def get_wmma_fragment_index(buffer, m_dim, n_dim): +def get_wmma_fragment_index(buffer, stride, m_dim, n_dim): """Compute wmma fragment index using elem_offset of the buffer""" - frag_size = lift(m_dim * n_dim) - return buffer.elem_offset // frag_size + (buffer.elem_offset % frag_size) // n_dim + frag_index_m = buffer.elem_offset // stride // m_dim + frag_index_n = buffer.elem_offset % stride // n_dim + + num_fragments_per_row = stride // n_dim + return frag_index_m * num_fragments_per_row + frag_index_n def get_wmma_load_intrin( @@ -526,6 +529,8 @@ def wmma_load_desc(a: T.handle, c: T.handle) -> None: def wmma_load_impl(a: T.handle, c: T.handle) -> None: s1 = T.var("int32") s0 = T.var("int32") + d1 = T.var("int32") + d0 = T.var("int32") A = T.match_buffer( a, (m_dim, n_dim), @@ -536,7 +541,13 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None: strides=[s1, s0], ) C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=wmma_fragment_scope + c, + (m_dim, n_dim), + dtype, + align=64, + offset_factor=16, + scope=wmma_fragment_scope, + strides=[d1, d0], ) with T.block("root"): T.reads(A[0:m_dim, 0:n_dim]) @@ -547,7 +558,7 @@ def wmma_load_impl(a: T.handle, c: T.handle) -> None: m_dim, n_dim, k_dim, - get_wmma_fragment_index(C, m_dim, n_dim), + get_wmma_fragment_index(C, d1, m_dim, n_dim), A.access_ptr("r"), s1, layout, @@ -579,8 +590,16 @@ def wmma_fill_desc(c: T.handle) -> None: @T.prim_func def wmma_fill_impl(c: T.handle) -> None: + d1 = T.var("int32") + d0 = T.var("int32") C = T.match_buffer( - c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" + c, + (m_dim, n_dim), + dtype, + align=64, + offset_factor=16, + scope="wmma.accumulator", + strides=[d1, d0], ) with T.block("root"): T.reads() @@ -591,7 +610,7 @@ def 
wmma_fill_impl(c: T.handle) -> None: m_dim, n_dim, k_dim, - get_wmma_fragment_index(C, m_dim, n_dim), + get_wmma_fragment_index(C, d1, m_dim, n_dim), T.float32(0), dtype="handle", ) @@ -623,8 +642,16 @@ def wmma_store_desc(a: T.handle, c: T.handle) -> None: def wmma_store_impl(a: T.handle, c: T.handle) -> None: s1 = T.var("int32") s0 = T.var("int32") + d1 = T.var("int32") + d0 = T.var("int32") A = T.match_buffer( - a, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope="wmma.accumulator" + a, + (m_dim, n_dim), + dtype, + align=64, + offset_factor=16, + scope="wmma.accumulator", + strides=[d1, d0], ) C = T.match_buffer( c, (m_dim, n_dim), dtype, align=64, offset_factor=16, scope=scope, strides=[s1, s0] @@ -638,7 +665,7 @@ def wmma_store_impl(a: T.handle, c: T.handle) -> None: m_dim, n_dim, k_dim, - get_wmma_fragment_index(A, m_dim, n_dim), + get_wmma_fragment_index(A, d1, m_dim, n_dim), C.access_ptr("w"), s1, "row_major", @@ -696,8 +723,21 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None: @T.prim_func def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: + a1 = T.var("int32") + a0 = T.var("int32") + b1 = T.var("int32") + b0 = T.var("int32") + c1 = T.var("int32") + c0 = T.var("int32") + A = T.match_buffer( - a, (m_dim, k_dim), in_dtype, align=64, offset_factor=16, scope="wmma.matrix_a" + a, + (m_dim, k_dim), + in_dtype, + align=64, + offset_factor=16, + scope="wmma.matrix_a", + strides=[a1, a0], ) B = T.match_buffer( b, @@ -706,9 +746,16 @@ def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: align=64, offset_factor=16, scope="wmma.matrix_b", + strides=[b1, b0], ) C = T.match_buffer( - c, (m_dim, n_dim), out_dtype, align=64, offset_factor=16, scope="wmma.accumulator" + c, + (m_dim, n_dim), + out_dtype, + align=64, + offset_factor=16, + scope="wmma.accumulator", + strides=[c1, c0], ) with T.block("root"): @@ -717,13 +764,13 @@ def wmma_sync_impl(a: T.handle, b: T.handle, c: T.handle) -> None: T.evaluate( 
T.tvm_mma_sync( C.data, - get_wmma_fragment_index(C, m_dim, n_dim), + get_wmma_fragment_index(C, c1, m_dim, n_dim), A.data, - get_wmma_fragment_index(A, m_dim, k_dim), + get_wmma_fragment_index(A, a1, m_dim, k_dim), B.data, - get_wmma_fragment_index(B, b_shape_0, b_shape_1), + get_wmma_fragment_index(B, b1, b_shape_0, b_shape_1), C.data, - get_wmma_fragment_index(C, m_dim, n_dim), + get_wmma_fragment_index(C, c1, m_dim, n_dim), dtype="handle", ) ) From d80ce6b1ba5439dbe0437be6e37121844f87a113 Mon Sep 17 00:00:00 2001 From: AndrewZhaoLuo Date: Fri, 23 Sep 2022 00:33:19 -0700 Subject: [PATCH 234/704] [EZ][Release] Update gather PRs Script (#12862) Update internal path to account for directory structure change in TVM repository, with the introduction of `ci` directory. --- tests/scripts/release/gather_prs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/scripts/release/gather_prs.py b/tests/scripts/release/gather_prs.py index 0720a87d042b..5fbfa2278feb 100644 --- a/tests/scripts/release/gather_prs.py +++ b/tests/scripts/release/gather_prs.py @@ -25,11 +25,12 @@ from typing import Callable, Dict, List, Any REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent -sys.path.append(str(REPO_ROOT / "tests" / "scripts")) +sys.path.append(str(REPO_ROOT / "ci" / "scripts")) from git_utils import git, GitHubRepo from github_tag_teams import tags_from_title + GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] From e8aeb4adf3525837db5f24965104640163b38f0e Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Fri, 23 Sep 2022 16:57:18 +0100 Subject: [PATCH 235/704] [CI] Add Zephyr-SDK binaries to PATH env. in ci_cortexm (#12884) In recent test rounds with updated images, it seems Zephyr-SDK binaries such as various QEMU related files are missing from $PATH, which makes Zephyr tests to fail with, e.g. "qemu-system-i386: command not found". This PR adds those missing binaries to $PATH. 
Co-authored-by: Gustavo Romero --- docker/Dockerfile.ci_cortexm | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile.ci_cortexm b/docker/Dockerfile.ci_cortexm index db02792efda9..8e8d2c0a4f9e 100644 --- a/docker/Dockerfile.ci_cortexm +++ b/docker/Dockerfile.ci_cortexm @@ -79,6 +79,7 @@ COPY install/ubuntu_init_zephyr_project.sh /install/ubuntu_init_zephyr_project.s COPY install/ubuntu_install_zephyr_sdk.sh /install/ubuntu_install_zephyr_sdk.sh RUN bash /install/ubuntu_install_zephyr.sh ENV ZEPHYR_BASE=/opt/zephyrproject/zephyr +ENV PATH /opt/zephyr-sdk/sysroots/x86_64-pokysdk-linux/usr/bin:$PATH # FreeRTOS deps COPY install/ubuntu_install_freertos.sh /install/ubuntu_install_freertos.sh From eba75e4640d68989cd850ef66bdac0061e873d92 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 23 Sep 2022 13:30:52 -0700 Subject: [PATCH 236/704] [METASCHEDULE] Mark work_dir as not optional in docs (#12888) --- python/tvm/meta_schedule/tune.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 20eccc30a113..b1cc0f67bd5f 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -259,7 +259,7 @@ def tune_extracted_tasks( The list of extracted tasks. config : TuneConfig The search strategy config. - work_dir : Optional[str] + work_dir : str The working directory to save intermediate results. builder : Optional[Builder] The builder to use. @@ -380,7 +380,7 @@ def tune_tir( The target to tune for. config : TuneConfig The search strategy config. - work_dir : Optional[str] + work_dir : str The working directory to save intermediate results. builder : Optional[Builder] The builder to use. @@ -499,7 +499,7 @@ def tune_te( The search strategy config. task_name : str The name of the task. - work_dir : Optional[str] + work_dir : str The working directory to save intermediate results. builder : Optional[Builder] The builder to use. 
@@ -569,7 +569,7 @@ def tune_relay( The associated parameters of the program task_name : str The name of the task. - work_dir : Optional[str] + work_dir : str The working directory to save intermediate results. builder : Optional[Builder] The builder to use. From 428269f80ca869a2fdee09af1683989448cd6bd4 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 23 Sep 2022 15:38:56 -0700 Subject: [PATCH 237/704] [FIX,PROFILING] Fix PAPI docs (#12861) The VM requires arguments to not be wrapped in an array. Passing the arguments unwrapped now. Also added relevant imports. --- docs/how_to/profile/papi.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/how_to/profile/papi.rst b/docs/how_to/profile/papi.rst index 78d512c9888b..02643451aa09 100644 --- a/docs/how_to/profile/papi.rst +++ b/docs/how_to/profile/papi.rst @@ -62,6 +62,12 @@ is an example: .. code:: python + import tvm + from tvm import relay + from tvm.relay.testing import mlp + from tvm.runtime import profiler_vm + import numpy as np + target = "llvm" dev = tvm.cpu() mod, params = mlp.get_workload(1) @@ -71,7 +77,7 @@ is an example: data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) report = vm.profile( - [data], + data, func_name="main", collectors=[tvm.runtime.profiling.PAPIMetricCollector()], ) @@ -94,7 +100,7 @@ You can also change which metrics are collected: .. code:: python report = vm.profile( - [data], + data, func_name="main", collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"])], ) From fbb500e92f4f3104710ae4119871e6c105de4dc4 Mon Sep 17 00:00:00 2001 From: multiverstack-intellif <39256082+multiverstack@users.noreply.github.com> Date: Sat, 24 Sep 2022 21:08:46 +0800 Subject: [PATCH 238/704] [TIR][Schedule] Relax cache read/write's restriction and fix unexpected behavior (#12766) [TIR][Schedule] Relax cache read/write's restriction and fix unexpected behavior.
Co-authored-by: Min Chen --- .../schedule/primitive/cache_read_write.cc | 76 +++++++++++++------ src/tir/schedule/state.cc | 1 + .../test_tir_schedule_cache_read_write.py | 63 ++++++++++++++- 3 files changed, 114 insertions(+), 26 deletions(-) diff --git a/src/tir/schedule/primitive/cache_read_write.cc b/src/tir/schedule/primitive/cache_read_write.cc index a221733eb394..c76e6abaebb5 100644 --- a/src/tir/schedule/primitive/cache_read_write.cc +++ b/src/tir/schedule/primitive/cache_read_write.cc @@ -382,9 +382,16 @@ class CacheLocDetector : public StmtVisitor { static void Detect(const ScheduleState& self, const StmtSRef& block_sref, const StmtSRef& scope_sref, CacheStageInfo* info) { std::vector related_blocks; - for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) { - if (def->kind == DepKind::kRAW) { - related_blocks.push_back(def->dst); + // If consumer is specified, skip detecting the others + if (info->consumer_blocks.size() > 0) { + for (StmtSRef consumer : info->consumer_blocks) { + related_blocks.emplace_back(consumer); + } + } else { + for (const Dependency& def : self->GetBlockScope(scope_sref)->GetDepsBySrc(block_sref)) { + if (def->kind == DepKind::kRAW) { + related_blocks.push_back(def->dst); + } } } if (!related_blocks.empty()) { @@ -416,29 +423,24 @@ class CacheLocDetector : public StmtVisitor { void VisitStmt_(const SeqStmtNode* seq_stmt) final { bool previous_visited_block = visited_block_; - bool previous_visited_related = visited_related_; - visited_block_ = visited_related_ = false; + visited_block_ = false; - int pos = -1; for (size_t i = 0; i < seq_stmt->size(); ++i) { if (loc_pos_ != -1) { break; } VisitStmt(seq_stmt->seq[i]); // `pos` can be assigned only once when we visited `block_sref` - if (visited_block_ && visited_related_ && pos == -1) { + if (visited_block_ && visited_related_ && loc_pos_ == -1) { // The offset of insert position from the block - pos = i; + loc_pos_ = i; + return; + } else if 
(visited_related_) { + // If meet the target consumer, stop searching + visited_block_ = visited_block_ || previous_visited_block; + return; } } - visited_block_ = visited_block_ || previous_visited_block; - visited_related_ = visited_related_ || previous_visited_related; - // Only we visited the writing block and any one of the related blocks - // That means that we have found the lowest ancestor - // of the block and any one of the related ones - if (visited_block_ && visited_related_ && loc_pos_ == -1) { - loc_pos_ = pos; - } } void VisitStmt_(const BlockNode* block) final { @@ -446,11 +448,12 @@ class CacheLocDetector : public StmtVisitor { if (block == scope_sref_->stmt) { // The block vistied is the current parent scope StmtVisitor::VisitStmt_(block); - // Handling cache_read for input buffer - if (visited_block_ && visited_related_ && !loc_sref_.defined()) { + // Handling cases when insert outside any loop or cache_read for input buffer + if (visited_related_ && !loc_sref_.defined()) { loc_sref_ = self_->stmt2ref.at(block); - if (loc_pos_ == -1) { - loc_pos_ = 1; + // Handling cache_read for input buffer + if (visited_block_ == false && loc_pos_ == -1) { + loc_pos_ = 0; } } return; @@ -980,6 +983,33 @@ class ReIndexRewriter : public StmtExprMutator { Region region_; }; +void CheckRegionCover(const ScheduleState& self, StmtSRef scope_root) { + class NotRegionCoverError : public ScheduleError { + public: + explicit NotRegionCoverError(IRModule mod, Block block) : mod_(mod), block_(block) {} + IRModule mod() const final { return mod_; } + String FastErrorString() const final { + return "ScheduleError: The scope root's region cover is not complete."; + } + String DetailRenderTemplate() const final { + return R"(The scope {0} 's region cover is not complete. 
+The region cover property require to hold for every of its child blocks +)"; + } + Array LocationsOfInterest() const final { return {block_}; } + IRModule mod_; + Block block_; + }; + BlockScope scope = self->GetBlockScope(scope_root); + for (const auto& kv : scope->dst2deps) { + const StmtSRef& consumer_block_sref = kv.first; + if (!self->block_info.at(consumer_block_sref).region_cover) { + const BlockNode* block = TVM_SREF_TO_BLOCK(scope_root); + throw NotRegionCoverError(self->mod, GetRef(block)); + } + } +} + /******** Implementation ********/ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buffer_index, @@ -1002,7 +1032,9 @@ StmtSRef CacheRead(ScheduleState self, const StmtSRef& block_sref, int read_buff const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Buffer read_buffer = GetNthAccessBuffer(self, GetRef(block), read_buffer_index, BufferIndexType::kRead); - StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); + // Check required region cover for cache_read + CheckRegionCover(self, scope_sref); const BlockNode* scope_block = TVM_SREF_TO_BLOCK(scope_sref); // Step 2. Create CacheStageInfo @@ -1075,7 +1107,7 @@ StmtSRef CacheWrite(ScheduleState self, const StmtSRef& block_sref, int write_bu const BlockNode* block = TVM_SREF_TO_BLOCK(block_sref); Buffer write_buffer = GetNthAccessBuffer(self, GetRef(block), write_buffer_index, BufferIndexType::kWrite); - StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/true); + StmtSRef scope_sref = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); // Step 2. 
Creating CacheStageInfo CacheStageInfo info; diff --git a/src/tir/schedule/state.cc b/src/tir/schedule/state.cc index 6d4a42236f57..27056124d9e1 100644 --- a/src/tir/schedule/state.cc +++ b/src/tir/schedule/state.cc @@ -346,6 +346,7 @@ class BlockInfoCollector : private StmtVisitor { if (!ProducerCoversConsumer(buffer->shape, produced_region, consumed_region, &analyzer_)) { region_cover = false; + self_->block_info.at(consumer_block_sref).region_cover = region_cover; break; } } diff --git a/tests/python/unittest/test_tir_schedule_cache_read_write.py b/tests/python/unittest/test_tir_schedule_cache_read_write.py index cf4836e5361e..334fb988d775 100644 --- a/tests/python/unittest/test_tir_schedule_cache_read_write.py +++ b/tests/python/unittest/test_tir_schedule_cache_read_write.py @@ -223,6 +223,24 @@ def func_with_block_predicate() -> None: B[ax] = A[ax] + 1.0 +@T.prim_func +def inplace_func(data_io: T.Buffer[(64), "int32"]): + data_1d = T.alloc_buffer([64], dtype="int32") + for i0 in T.serial(64): + with T.block("copy_in"): + v0 = T.axis.remap("S", [i0]) + data_1d[v0] = data_io[v0] + for i0 in T.serial(1): + with T.block("ext_call"): + T.reads(data_1d[:64]) + T.writes(data_1d[:64]) + T.evaluate(T.call_extern("call_impl", data_1d.data, dtype="")) + for i0 in T.serial(64): + with T.block("copy_out"): + v0 = T.axis.remap("S", [i0]) + data_io[v0] = data_1d[v0] + + ########## Expected function after cache_read ########## @@ -414,15 +432,15 @@ def cache_read_multi_consumer_target() -> None: with T.block("A"): vi = T.axis.S(128, i * 16 + j) A[vi] = 1.0 - for j in T.grid(16): - with T.block("A"): - vi = T.axis.S(128, i * 16 + j) - A_global[vi] = A[vi] for j in T.grid(16): with T.block("B"): vi = T.axis.S(128, i * 16 + j) B[vi] = A[vi] + 1.0 + for i in T.grid(128): + with T.block("A"): + vi = T.axis.S(128, i) + A_global[vi] = A[vi] for i in T.grid(128): with T.block("C"): vi = T.axis.S(128, i) @@ -501,6 +519,35 @@ def cache_read_shape_int64(var_A: T.handle, var_C: T.handle) 
-> None: C[vi, vj] = B[vi, vj] + T.float32(1) +@T.prim_func +def cache_read_inplace(data_io: T.Buffer[64, "int32"]) -> None: + data_1d = T.alloc_buffer([64], dtype="int32") + data_io_local = T.alloc_buffer([64], dtype="int32", scope="local") + for ax0 in T.serial(64): + with T.block("data_io_local"): + v0 = T.axis.spatial(64, ax0) + T.reads(data_io[v0]) + T.writes(data_io_local[v0]) + data_io_local[v0] = data_io[v0] + for i0 in T.serial(64): + with T.block("copy_in"): + v0 = T.axis.spatial(64, i0) + T.reads(data_io_local[v0]) + T.writes(data_1d[v0]) + data_1d[v0] = data_io_local[v0] + for i0 in T.serial(1): + with T.block("ext_call"): + T.reads(data_1d[0:64]) + T.writes(data_1d[0:64]) + T.evaluate(T.call_extern("call_impl", data_1d.data, dtype="")) + for i0 in T.serial(64): + with T.block("copy_out"): + v0 = T.axis.spatial(64, i0) + T.reads(data_1d[v0]) + T.writes(data_io[v0]) + data_io[v0] = data_1d[v0] + + ########## Expected function after cache_write ########## @@ -876,6 +923,14 @@ def test_cache_read_fail_invalid_storage_scope(use_block_name): sch.cache_read(block_b, 0, "test_scope") +def test_inplace_cache_read(): + sch = tvm.tir.Schedule(inplace_func, debug_mask="all") + block = sch.get_block("copy_in") + sch.cache_read(block, 0, "local", [block]) + tvm.ir.assert_structural_equal(cache_read_inplace, sch.mod["main"]) + verify_trace_roundtrip(sch=sch, mod=inplace_func) + + ########## Testcases for cache_write ########## From 71f25b3d6c851046e925ef6a2d2626626084913a Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Sat, 24 Sep 2022 18:28:02 -0500 Subject: [PATCH 239/704] [IR] Use TVM_DEFINE_OBJECT_REF_METHODS macro for Op (#12893) Previously, the `get()` method wasn't defined, and returned a `RelayExprNode` instead of a `Op::ContainerType*`. 
--- include/tvm/ir/op.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h index 683170026451..6e6b8bee5fc3 100644 --- a/include/tvm/ir/op.h +++ b/include/tvm/ir/op.h @@ -164,15 +164,6 @@ class OpNode : public RelayExprNode { */ class Op : public RelayExpr { public: - /*! \brief default constructor */ - Op() {} - /*! \brief constructor from node pointer */ - explicit Op(ObjectPtr n) : RelayExpr(n) {} - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - inline const OpNode* operator->() const; /*! * \brief Get additional registered attribute about operators. * If nothing has been registered, an empty OpAttrMap will be returned. @@ -196,8 +187,7 @@ class Op : public RelayExpr { */ TVM_DLL static const Op& Get(const String& op_name); - /*! \brief specify container node */ - using ContainerType = OpNode; + TVM_DEFINE_OBJECT_REF_METHODS(Op, RelayExpr, OpNode) private: /*! 
@@ -370,7 +360,6 @@ class OpAttrMap : public AttrRegistryMap { ::tvm::OpRegEntry::RegisterOrGet(OpName).set_name() // implementations -inline const OpNode* Op::operator->() const { return static_cast(get()); } template inline OpAttrMap Op::GetAttrMap(const String& key) { From a61c1ad0f03b53a4b5a3cc3e4a60d6daafe2b1e2 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Mon, 26 Sep 2022 08:47:09 +0800 Subject: [PATCH 240/704] [TIR] Fix plan buffer allocation location for loop carried dependencies (#12757) * Fix plan buffer allocation location for loop carried dependencies * fix testcase region annotation issue * fix typo in ut --- .../analysis/buffer_access_lca_detector.cc | 106 +++++++++++++++-- ..._plan_update_buffer_allocation_location.py | 109 +++++++++++++++++- 2 files changed, 200 insertions(+), 15 deletions(-) diff --git a/src/tir/analysis/buffer_access_lca_detector.cc b/src/tir/analysis/buffer_access_lca_detector.cc index 7197e1ba83c5..64d10fae2ff1 100644 --- a/src/tir/analysis/buffer_access_lca_detector.cc +++ b/src/tir/analysis/buffer_access_lca_detector.cc @@ -99,23 +99,32 @@ class LCADetector : public StmtExprVisitor { } ancestor_scopes_.push_back(current_scope); + loop_scope_map_.insert({op->loop_var.get(), current_scope}); StmtExprVisitor::VisitStmt_(op); ancestor_scopes_.pop_back(); + loop_scope_map_.erase(op->loop_var.get()); } - void VisitStmt_(const BlockNode* op) final { + void VisitStmt_(const BlockRealizeNode* op) final { + const BlockNode* block = op->block.get(); int n = ancestor_scopes_.size(); - for (const Buffer& buf : op->alloc_buffers) { + for (const Buffer& buf : block->alloc_buffers) { buffer_var_map_.emplace(buf->data.get(), buf.get()); } const ScopeInfo* parent_scope = ancestor_scopes_.back(); - auto* current_scope = arena_.make(parent_scope, op, n); + auto* current_scope = arena_.make(parent_scope, block, n); ancestor_scopes_.push_back(current_scope); + + // For each accessed buffer of the block, update the buffer's lca to + // the lowest 
inclusive stmt position, which should dominate all loops + // related to the accessed opaque block iter vars in buffer indices. + UpdateDominateScopeOfOpaqueIter(op); + // Update match_buffers - for (const MatchBufferRegion& match_buffer : op->match_buffers) { - UpdateBufferLCA(match_buffer->source->buffer.get()); + for (const MatchBufferRegion& match_buffer : block->match_buffers) { + UpdateBufferLCA(match_buffer->source->buffer.get(), ancestor_scopes_.back()); match_buffers_.insert(match_buffer->buffer.get()); } @@ -123,6 +132,80 @@ class LCADetector : public StmtExprVisitor { ancestor_scopes_.pop_back(); } + void UpdateDominateScopeOfOpaqueIter(const BlockRealizeNode* block_realize) { + // map opaque iter var to the scope which dominate all loop carried dependencies. + std::unordered_map itervar_to_dom_scope; + + // function to collect `itervar_to_dom_scope`, the result scope for each block + // iter var should be above all loop scopes the opaque iter var binding relates to. + auto do_collect_itervar_scope = [this, &itervar_to_dom_scope](const IterVar& itervar, + const PrimExpr& binding) { + PostOrderVisit(binding, [this, &itervar_to_dom_scope, &itervar](const ObjectRef& obj) { + if (const VarNode* loop_var = obj.as()) { + auto it = loop_scope_map_.find(loop_var); + if (it == loop_scope_map_.end()) { + return; + } + const ScopeInfo* scope = it->second->parent_scope_info; + // find the highest loop scope the iter var binding has related to. + auto dom_scope_it = itervar_to_dom_scope.find(itervar->var.get()); + if (dom_scope_it == itervar_to_dom_scope.end()) { + itervar_to_dom_scope.insert(dom_scope_it, {itervar->var.get(), scope}); + } else if (scope->depth < dom_scope_it->second->depth) { + dom_scope_it->second = scope; + } + } + }); + }; + + // function to update lca scope of the buffer with loop carried dependent buffer accesses. 
+ // the result scope should be above all loop scopes the accessed opaque block iter vars + // relate to, which is record in `itervar_to_dom_scope`. + auto do_update = [this, &itervar_to_dom_scope](const BufferRegion& region) { + const Buffer& buffer = region->buffer; + const ScopeInfo* scope = ancestor_scopes_.back(); + + auto handle_itervar = [&itervar_to_dom_scope, &scope](const ObjectRef& obj) { + if (const VarNode* iter_var = obj.as()) { + auto dom_scope_it = itervar_to_dom_scope.find(iter_var); + if (dom_scope_it == itervar_to_dom_scope.end()) { + return; + } + // find the highest loop scope the accessed buffer index has + // loop carried dependencies to (via opaque iter var binding). + if (dom_scope_it->second->depth < scope->depth) { + scope = dom_scope_it->second; + } + } + }; + + // visit region min and max to find the lowest legal lca scope + for (const Range& range : region->region) { + PostOrderVisit(range->min, handle_itervar); + PostOrderVisit(range->min + range->extent - 1, handle_itervar); + } + UpdateBufferLCA(buffer.get(), scope); + }; + + // do collect and update + const Block& block = block_realize->block; + for (size_t i = 0; i < block_realize->iter_values.size(); ++i) { + const IterVar& iter_var = block->iter_vars[i]; + if (iter_var->iter_type != IterVarType::kDataPar && + iter_var->iter_type != IterVarType::kCommReduce) { + do_collect_itervar_scope(iter_var, block_realize->iter_values[i]); + } + } + if (!itervar_to_dom_scope.empty()) { + for (const auto& read : block->reads) { + do_update(read); + } + for (const auto& write : block->writes) { + do_update(write); + } + } + } + void VisitStmt_(const AttrStmtNode* op) final { if (op->attr_key == attr::thread_extent) { const auto* iter = op->node.as(); @@ -136,17 +219,18 @@ class LCADetector : public StmtExprVisitor { } void VisitExpr_(const BufferLoadNode* op) final { - UpdateBufferLCA(op->buffer.get()); + UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back()); 
StmtExprVisitor::VisitExpr_(op); } void VisitStmt_(const BufferStoreNode* op) final { - UpdateBufferLCA(op->buffer.get()); + UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back()); StmtExprVisitor::VisitStmt_(op); } void VisitStmt_(const BufferRealizeNode* op) final { buffer_var_map_.emplace(op->buffer->data.get(), op->buffer.get()); + UpdateBufferLCA(op->buffer.get(), ancestor_scopes_.back()); StmtExprVisitor::VisitStmt_(op); } @@ -165,16 +249,16 @@ class LCADetector : public StmtExprVisitor { void VisitBufferVar(const VarNode* op) { auto it = buffer_var_map_.find(op); if (it != buffer_var_map_.end()) { - UpdateBufferLCA(it->second); + UpdateBufferLCA(it->second, ancestor_scopes_.back()); } } - void UpdateBufferLCA(const BufferNode* buffer) { + void UpdateBufferLCA(const BufferNode* buffer, const ScopeInfo* scope) { buffer_var_map_.emplace(buffer->data.get(), buffer); if (match_buffers_.find(buffer) == match_buffers_.end()) { // Ingore buffer created by block match_buffer const ScopeInfo*& lca = buffer_lca_[buffer]; - lca = LowestCommonAncestor(lca, ancestor_scopes_.back()); + lca = LowestCommonAncestor(lca, scope); } } @@ -229,6 +313,8 @@ class LCADetector : public StmtExprVisitor { std::unordered_set match_buffers_ = {}; /*! \brief The ForNodes/BlockNodes which contain immediate `blockIdx` launch. */ std::vector blockidx_scopes_ = {}; + /*! \brief The map from loop var to the corresponding scope. */ + std::unordered_map loop_scope_map_ = {}; /*! \brief Internal arena. 
*/ support::Arena arena_; }; diff --git a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py index c22f5f82ee10..34d82f86a422 100644 --- a/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py +++ b/tests/python/unittest/test_tir_transform_plan_update_buffer_allocation_location.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +import tvm.testing from tvm import te from tvm.script import tir as T @@ -242,9 +243,107 @@ def test_lower_te(): ) # PlanAndUpdateBufferAllocationLocation should do nothing on TE +def test_loop_carried_dependency(): + """The buffer allocation should be above opaque iter var's loop scopes + such that buffer accesses with loop carried dependencies are covered.""" + + @T.prim_func + def before(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]): + C = T.alloc_buffer([8, 8, 8], dtype="int32") + for i in T.serial(8): + for j in T.serial(8): + for k in T.serial(8): + with T.block("b0"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + C[vi, vj, vk] = A[vi, vj, vk] + 1 + for k in T.serial(8): + with T.block("b1"): + vi, vk = T.axis.remap("SS", [i, k]) + vj = T.axis.opaque(8, j) + B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else( + 0 < vj, C[vi, vj - 1, vk], 0, dtype="int32" + ) + + @T.prim_func + def after(A: T.Buffer[(8, 8, 8), "int32"], B: T.Buffer[(8, 8, 8), "int32"]) -> None: + for i in T.serial(8): + with T.block(): + T.reads(A[i, 0:8, 0:8]) + T.writes(B[i, 0:8, 0:8]) + C = T.alloc_buffer([8, 8, 8], dtype="int32") + for j in T.serial(8): + for k in T.serial(8): + with T.block("b0"): + vi, vj, vk = T.axis.remap("SSS", [i, j, k]) + C[vi, vj, vk] = A[vi, vj, vk] + 1 + for k in T.serial(8): + with T.block("b1"): + vi, vk = T.axis.remap("SS", [i, k]) + vj = T.axis.opaque(8, j) + B[vi, vj, vk] = C[vi, vj, vk] + T.if_then_else( + 0 < 
vj, C[vi, vj - 1, vk], 0, dtype="int32" + ) + + _check(before, after) + + +def test_1D_cascade_op_rolling_buffer(): + """The intermediate buffer must be allocated above rolling buffer's rolling loop, + which is marked as opaque in consumer block's iter mappings.""" + + @T.prim_func + def before(A: T.Buffer[(4, 16), "int32"], C: T.Buffer[(4, 8), "int32"]): + B = T.alloc_buffer((4, 6), "int32") + for c in T.serial(4): + for i in T.serial(0, 2): + for j in T.serial(0, 6): + for k in T.serial(3): + with T.block("P1"): + T.where(i < 1 or j >= 2) + cc, vi, vj, vk = T.axis.remap("SSSR", [c, i, j, k]) + if vk == 0: + B[cc, T.floormod(vi * 4 + vj, 6)] = 0 + B[cc, T.floormod(vi * 4 + vj, 6)] = ( + B[cc, T.floormod(vi * 4 + vj, 6)] + A[cc, vi * 4 + vj + vk] + ) + for j in T.serial(0, 4): + for k in T.serial(3): + with T.block("P2"): + vi = T.axis.opaque(2, i) + cc, vj, vk = T.axis.remap("SSR", [c, j, k]) + if vk == 0: + C[cc, vi * 4 + vj] = 0 + C[cc, vi * 4 + vj] = ( + C[cc, vi * 4 + vj] + B[cc, T.floormod(vi * 4 + vj + vk, 6)] + ) + + @T.prim_func + def after(A: T.Buffer[(4, 16), "int32"], C: T.Buffer[(4, 8), "int32"]): + for c in T.serial(4): + with T.block(): + T.reads(A[c, 0:12], C[c, 0:8]) + T.writes(C[c, 0:8]) + B = T.alloc_buffer([4, 6], dtype="int32") + for i in T.serial(2): + for j, k in T.grid(6, 3): + with T.block("P1"): + T.where(i < 1 or j >= 2) + cc, vi, vj, vk = T.axis.remap("SSSR", [c, i, j, k]) + if vk == 0: + B[cc, (vi * 4 + vj) % 6] = 0 + B[cc, (vi * 4 + vj) % 6] = ( + B[cc, (vi * 4 + vj) % 6] + A[cc, vi * 4 + vj + vk] + ) + for j, k in T.grid(4, 3): + with T.block("P2"): + vi = T.axis.opaque(2, i) + cc, vj, vk = T.axis.remap("SSR", [c, j, k]) + if vk == 0: + C[cc, vi * 4 + vj] = 0 + C[cc, vi * 4 + vj] = C[cc, vi * 4 + vj] + B[cc, (vi * 4 + vj + vk) % 6] + + _check(before, after) + + if __name__ == "__main__": - test_elementwise() - test_locate_buffer_allocation() - test_match_buffer_allocation() - test_opaque_access() - test_lower_te() + tvm.testing.main() 
From c8423a6843edec5e85003a33d260f2214fd16c42 Mon Sep 17 00:00:00 2001 From: Yuanjing Shi Date: Sun, 25 Sep 2022 22:50:20 -0700 Subject: [PATCH 241/704] [Meta Schedule][XGBoost] Update the custom callback function of xgboost in meta schedule (#12141) * update the custom callback function of xgboost * fix lint * fix ci * fix lint * add unit test * remote unused code * fix lint * add decorator * address comment * fix lint * address comments * fix mypy * fix lint * remove unused comments * address comments * Fix xgboost unit test import. Co-authored-by: Xiyou Zhou --- .../tvm/meta_schedule/cost_model/xgb_model.py | 169 +++++++++++------- .../unittest/test_meta_schedule_cost_model.py | 85 +++++++++ 2 files changed, 194 insertions(+), 60 deletions(-) diff --git a/python/tvm/meta_schedule/cost_model/xgb_model.py b/python/tvm/meta_schedule/cost_model/xgb_model.py index 8de034758b4b..1171e081b90a 100644 --- a/python/tvm/meta_schedule/cost_model/xgb_model.py +++ b/python/tvm/meta_schedule/cost_model/xgb_model.py @@ -35,7 +35,26 @@ from ..utils import cpu_count, derived_object, shash2hex from .metric import max_curve + +def optional_xgboost_callback(cls): + """Decorator for importing TraningCallback from xgboost""" + # pylint:disable = import-outside-toplevel + try: + from xgboost.callback import TrainingCallback # type: ignore + # pylint:enable = import-outside-toplevel + except ImportError: + + class TrainingCallback: # type: ignore + pass + + class OptXGBoostCustomCallback(cls, TrainingCallback): # type: ignore + pass + + return OptXGBoostCustomCallback + + if TYPE_CHECKING: + import xgboost as xgb # type: ignore from ..tune_context import TuneContext @@ -579,14 +598,12 @@ def avg_peak_score(ys_pred: np.ndarray, d_train: "xgb.DMatrix"): # type: ignore num_boost_round=10000, obj=obj, callbacks=[ - custom_callback( + XGBoostCustomCallback( early_stopping_rounds=self.early_stopping_rounds, verbose_eval=self.verbose_eval, - fevals=[ - rmse, - avg_peak_score, - ], + 
fevals=[rmse, avg_peak_score], evals=[(self.d_train.dmatrix, "tr")], + cvfolds=None, ) ], ) @@ -640,52 +657,83 @@ def average_peak_score(ys_pred: np.ndarray): return eval_result -def custom_callback( - early_stopping_rounds: int, - verbose_eval: int, - fevals: List[Callable], - evals: List[Tuple["xgb.DMatrix", str]], - focused_metric: str = "tr-p-rmse", -): - """Callback function for xgboost to support multiple custom evaluation functions""" - sort_key = make_metric_sorter(focused_metric=focused_metric) - - state: Dict[str, Any] = {} - - def init(env: "xgb.core.CallbackEnv"): - """Internal function""" - booster: "xgb.Booster" = env.model +@optional_xgboost_callback +class XGBoostCustomCallback: + """Custom callback class for xgboost to support multiple custom evaluation functions""" - state["best_iteration"] = 0 - state["best_score"] = float("inf") + def __init__( + self, + early_stopping_rounds: int, + verbose_eval: int, + fevals: List[Callable], + evals: List[Tuple["xgb.DMatrix", str]], + focused_metric: str = "tr-p-rmse", + cvfolds: List["xgb.training.CVPack"] = None, + ): + self.early_stopping_rounds = early_stopping_rounds + self.verbose_eval = verbose_eval + self.fevals = fevals + self.evals = evals + self.state: Dict[str, Any] = {} + self.focused_metric = focused_metric + self.sort_key = make_metric_sorter(focused_metric=focused_metric) + self.cvfolds = cvfolds + if cvfolds is not None: + self.aggregated_cv = None + + def __call__(self, env: "xgb.core.CallbackEnv"): + # Compatibility with xgboost < 1.3 + return self.after_iteration(env.model, env.iteration, env.evaluation_result_list) + + def init(self, model: "xgb.Booster"): + """Internal function for intialization""" + booster: "xgb.Booster" = model + self.state["best_iteration"] = 0 + self.state["best_score"] = float("inf") if booster is None: - assert env.cvfolds is not None + assert self.cvfolds is not None return if booster.attr("best_score") is not None: - state["best_score"] = 
float(booster.attr("best_score")) - state["best_iteration"] = int(booster.attr("best_iteration")) - state["best_msg"] = booster.attr("best_msg") + self.state["best_score"] = float(booster.attr("best_score")) + self.state["best_iteration"] = int(booster.attr("best_iteration")) + self.state["best_msg"] = booster.attr("best_msg") else: - booster.set_attr(best_iteration=str(state["best_iteration"])) - booster.set_attr(best_score=str(state["best_score"])) + booster.set_attr(best_iteration=str(self.state["best_iteration"])) + booster.set_attr(best_score=str(self.state["best_score"])) - def callback(env: "xgb.core.CallbackEnv"): + def after_iteration( + self, model: "xgb.Booster", epoch: int, evals_log: Dict + ): # pylint: disable = unused-argument + """Internal function for after_iteration""" # pylint:disable = import-outside-toplevel + try: + from xgboost.callback import _fmt_metric # type: ignore + except ImportError: + # Compatibility with xgboost >= 1.6 + + def _fmt_metric(value, show_stdv=True): + if len(value) == 2: + return f"{value[0]}:{value[1]:.5f}" + if len(value) == 3: + if show_stdv: + return f"{value[0]}:{value[1]:.5f}+{value[2]:.5f}" + return f"{value[0]}:{value[1]:.5f}" + raise ValueError("wrong metric value", value) + import xgboost as xgb - from xgboost.callback import _fmt_metric # type: ignore - from xgboost.core import EarlyStopException # type: ignore + from xgboost import rabit # type: ignore try: from xgboost.training import aggcv # type: ignore except ImportError: from xgboost.callback import _aggcv as aggcv # type: ignore - # pylint:enable = import-outside-toplevel - if not state: - init(env) - booster: xgb.Booster = env.model - iteration: int = env.iteration - cvfolds: List[xgb.training.CVPack] = env.cvfolds + # pylint:enable = import-outside-toplevel + if not self.state: + self.init(model) + booster: xgb.Booster = model + iteration: int = epoch + cvfolds: List[xgb.training.CVPack] = self.cvfolds ##### Evaluation ##### # `eval_result` is a list 
of (key, score) eval_result: List[Tuple[str, float]] = [] @@ -697,13 +745,13 @@ def callback(env: "xgb.core.CallbackEnv"): for key, value in map( lambda x: x.split(":"), booster.eval_set( - evals=evals, + evals=self.evals, iteration=iteration, feval=feval, ).split()[1:], ) ] - for feval in fevals + for feval in self.fevals ) ) else: @@ -719,14 +767,14 @@ def callback(env: "xgb.core.CallbackEnv"): for fold in cvfolds ) ] - for feval in fevals + for feval in self.fevals ) ) eval_result = list(eval_result) - eval_result.sort(key=sort_key) + eval_result.sort(key=self.sort_key) ##### Print eval result ##### - if verbose_eval and iteration % verbose_eval == 0: + if self.verbose_eval and iteration % self.verbose_eval == 0: info = [] for key, score in eval_result: if "null" not in key: @@ -736,30 +784,31 @@ def callback(env: "xgb.core.CallbackEnv"): ##### Choose score and do early stopping ##### score = None for key, _score in eval_result: - if key == focused_metric: + if key == self.focused_metric: score = _score break assert score is not None - best_score = state["best_score"] - best_iteration = state["best_iteration"] + best_score = self.state["best_score"] + best_iteration = self.state["best_iteration"] if score < best_score: tab = "\t" # to work with f-string - msg = f"[{env.iteration}] {tab.join([_fmt_metric(x) for x in eval_result])}" - state["best_msg"] = msg - state["best_score"] = score - state["best_iteration"] = env.iteration + msg = f"[{epoch}] {tab.join([_fmt_metric(x) for x in eval_result])}" + self.state["best_msg"] = msg + self.state["best_score"] = score + self.state["best_iteration"] = epoch # save the property to attributes, so they will occur in checkpoint. 
- if env.model is not None: - env.model.set_attr( - best_score=str(state["best_score"]), - best_iteration=str(state["best_iteration"]), - best_msg=state["best_msg"], + if model is not None: + model.set_attr( + best_score=str(self.state["best_score"]), + best_iteration=str(self.state["best_iteration"]), + best_msg=self.state["best_msg"], ) - elif env.iteration - best_iteration >= early_stopping_rounds: - best_msg = state["best_msg"] - if verbose_eval and env.rank == 0: - logger.debug("XGB stopped. Best iteration: %s ", best_msg) - raise EarlyStopException(best_iteration) + elif epoch - best_iteration >= self.early_stopping_rounds: + best_msg = self.state["best_msg"] - return callback + if self.verbose_eval and rabit.get_rank() == 0: + logger.debug("XGB stopped. Best iteration: %s ", best_msg) + return True # instead of raising EarlyStopException, returning True to end the training + # False to indicate training should not stop. + return False diff --git a/tests/python/unittest/test_meta_schedule_cost_model.py b/tests/python/unittest/test_meta_schedule_cost_model.py index d1d558181324..94b7bce246f4 100644 --- a/tests/python/unittest/test_meta_schedule_cost_model.py +++ b/tests/python/unittest/test_meta_schedule_cost_model.py @@ -27,6 +27,7 @@ import tvm import tvm.testing from tvm.meta_schedule.cost_model import PyCostModel, RandomModel, XGBModel +from tvm.meta_schedule.cost_model.xgb_model import XGBoostCustomCallback, PackSum from tvm.meta_schedule.feature_extractor import RandomFeatureExtractor from tvm.meta_schedule.runner import RunnerResult from tvm.meta_schedule.search_strategy import MeasureCandidate @@ -228,5 +229,89 @@ def test_meta_schedule_xgb_model_reupdate(): model.predict(TuneContext(), [_dummy_candidate() for i in range(predict_sample_count)]) +def test_meta_schedule_xgb_model_callback(): + import xgboost as xgb + from itertools import chain as itertools_chain + from functools import partial + + extractor = RandomFeatureExtractor() + model = 
XGBModel(extractor=extractor, num_warmup_samples=10) + update_sample_count = 20 + predict_sample_count = 30 + + model.update( + TuneContext(), + [_dummy_candidate() for i in range(update_sample_count)], + [_dummy_result() for i in range(update_sample_count)], + ) + model.predict(TuneContext(), [_dummy_candidate() for i in range(predict_sample_count)]) + with tempfile.NamedTemporaryFile() as path: + # Backup and train on new TrainingCallBack api + random_state = model.extractor.random_state # save feature extractor's random state + + model.save(path.name) + + old_booster = model.booster + xs = [ + x.numpy().astype("float32") + for x in extractor.extract_from( + TuneContext(), + [_dummy_candidate() for i in range(predict_sample_count)], + ) + ] + d_test = PackSum(xs=xs, ys=None) + pred1 = old_booster.predict(d_test.dmatrix) + + # Load and train on deprecated TrainingCallBack api + model.extractor.random_state = random_state # load feature extractor's random state + model.load(path.name) + d_train = PackSum( + xs=list(itertools_chain.from_iterable([g.features for g in model.data.values()])), + ys=np.concatenate( + [g.min_cost / g.costs for g in model.data.values()], + axis=0, + ), + ) + + def obj(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"): # type: ignore # pylint: disable = unused-argument + return d_train.obj_square_error(ys_pred) + + def rmse(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"): # type: ignore # pylint: disable = unused-argument + return d_train.rmse(ys_pred) + + def avg_peak_score(ys_pred: np.ndarray, d_train1: "xgb.DMatrix"): # type: ignore # pylint: disable = unused-argument + return d_train.average_peak_score(ys_pred, model.average_peak_n) + + new_booster = xgb.train( + model.config.to_dict(), + d_train.dmatrix, + num_boost_round=10000, + obj=obj, + callbacks=[ + partial( + XGBoostCustomCallback( + early_stopping_rounds=model.early_stopping_rounds, + verbose_eval=model.verbose_eval, + fevals=[rmse, avg_peak_score], + evals=[(d_train.dmatrix, "tr")], 
+ cvfolds=None, + ) + ) + ], + ) + + xs = [ + x.numpy().astype("float32") + for x in extractor.extract_from( + TuneContext(), + [_dummy_candidate() for i in range(predict_sample_count)], + ) + ] + d_test = PackSum(xs=xs, ys=None) + pred2 = new_booster.predict(d_test.dmatrix) + + assert np.allclose(pred1, pred2, rtol=1e-3, atol=1e-3) + + if __name__ == "__main__": tvm.testing.main() From 46ea2ed42ee41225141c5ed522900d340b08944d Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Sun, 25 Sep 2022 22:50:51 -0700 Subject: [PATCH 242/704] [MetaSchedule][UX] User Interface for Jupyter Notebook (#12866) * Add features for jupyter notebook. * Fix workload import warnings. * Enable output clearing for cli. * Fix test. * Fix lint. * Change to separate cleaning function. --- .../meta_schedule/testing/relay_workload.py | 18 +++---- python/tvm/meta_schedule/utils.py | 18 ++++++- .../task_scheduler/gradient_based.cc | 50 ++++++++++++++----- src/meta_schedule/utils.h | 32 ++++++++++++ .../unittest/test_meta_schedule_tune_relay.py | 18 +++---- 5 files changed, 104 insertions(+), 32 deletions(-) diff --git a/python/tvm/meta_schedule/testing/relay_workload.py b/python/tvm/meta_schedule/testing/relay_workload.py index 98bb99512020..9dcff2ace583 100644 --- a/python/tvm/meta_schedule/testing/relay_workload.py +++ b/python/tvm/meta_schedule/testing/relay_workload.py @@ -61,23 +61,23 @@ def _get_network( assert layout is None or layout in ["NCHW", "NHWC"] if name in ["resnet_18", "resnet_50"]: - model = getattr(models, name.replace("_", ""))(pretrained=False) + model = getattr(models, name.replace("_", ""))(weights=None) elif name == "wide_resnet_50": - model = getattr(models, "wide_resnet50_2")(pretrained=False) + model = getattr(models, "wide_resnet50_2")(weights=None) elif name == "resnext_50": - model = getattr(models, "resnext50_32x4d")(pretrained=False) + model = getattr(models, "resnext50_32x4d")(weights=None) elif name == "mobilenet_v2": - model = getattr(models, 
name)(pretrained=False) + model = getattr(models, name)(weights=None) elif name == "mobilenet_v3": - model = getattr(models, name + "_large")(pretrained=False) + model = getattr(models, name + "_large")(weights=None) elif name == "inception_v3": - model = getattr(models, name)(pretrained=False, aux_logits=False) + model = getattr(models, name)(weights=None, aux_logits=False) elif name == "densenet_121": - model = getattr(models, name.replace("_", ""))(pretrained=False) + model = getattr(models, name.replace("_", ""))(weights=None) elif name == "resnet3d_18": - model = models.video.r3d_18(pretrained=False) + model = models.video.r3d_18(weights=None) elif name == "vgg_16": - model = getattr(models, name.replace("_", ""))(pretrained=False) + model = getattr(models, name.replace("_", ""))(weights=None) dtype = "float32" input_data = torch.randn(input_shape).type( # pylint: disable=no-member diff --git a/python/tvm/meta_schedule/utils.py b/python/tvm/meta_schedule/utils.py index 26bf20670955..7b7c4a68653d 100644 --- a/python/tvm/meta_schedule/utils.py +++ b/python/tvm/meta_schedule/utils.py @@ -371,11 +371,27 @@ def make_logging_func(logger: logging.Logger) -> Optional[Callable]: } def logging_func(level: int, msg: str): - level2log[level](msg) + def clear_notebook_output(): + from IPython.display import clear_output # type: ignore # pylint: disable=import-outside-toplevel + + clear_output(wait=True) + + if level < 0: + clear_notebook_output() + else: + level2log[level](msg) return logging_func +@register_func("meta_schedule.using_ipython") +def _check_ipython_env(): + try: + return get_ipython().__class__.__name__ == "ZMQInteractiveShell" # type: ignore + except NameError: + return False + + def parameterize_config(config: Dict[str, Any], params: Dict[str, str]) -> Dict[str, Any]: """Parameterize the given configuration. 
diff --git a/src/meta_schedule/task_scheduler/gradient_based.cc b/src/meta_schedule/task_scheduler/gradient_based.cc index 73d191f593fe..506bb620e1d8 100644 --- a/src/meta_schedule/task_scheduler/gradient_based.cc +++ b/src/meta_schedule/task_scheduler/gradient_based.cc @@ -61,22 +61,43 @@ class GradientBasedNode final : public TaskSchedulerNode { int total_trials = 0; double total_latency = 0.0; support::TablePrinter p; - p.Row() << "ID" - << "Name" - << "FLOP" - << "Weight" - << "Speed (GFLOPS)" - << "Latency (us)" - << "Weighted Latency (us)" - << "Trials" - << "Terminated"; + + if (using_ipython()) { + p.Row() << "ID" + << "Name" + << "FLOP" + << "Weight" + << "GFLOPS" + << "Latency (us)" + << "Wtd. Latency" + << "Trials" + << "Terminated"; + } else { + p.Row() << "ID" + << "Name" + << "FLOP" + << "Weight" + << "Speed (GFLOPS)" + << "Latency (us)" + << "Weighted Latency (us)" + << "Trials" + << "Terminated"; + } + p.Separator(); + for (int i = 0; i < n_tasks; ++i) { const TaskRecord& record = task_records_[i]; auto row = p.Row(); int trials = record.trials; + String task_name = record.task->task_name.value(); + if (using_ipython() && task_name.length() > 23) { + std::string temp = task_name.c_str(); + temp = temp.substr(0, 20) + "..."; + task_name = String(temp); + } row << /*id=*/i // - << /*name=*/record.task->task_name.value() // + << /*name=*/task_name // << /*flops=*/static_cast(record.flop) // << /*weight=*/static_cast(record.weight); double latency = 1e9; @@ -101,9 +122,10 @@ class GradientBasedNode final : public TaskSchedulerNode { } } p.Separator(); - os << p.AsStr() // - << "\nTotal trials: " << total_trials // - << "\nTotal latency (us): " << total_latency // + os << p.AsStr() // + << "\nProgress: " << total_trials / (max_trials * 0.01) << "%" // + << "\nTotal Trials: " << total_trials << " / " << max_trials // + << "\nTotal latency (us): " << total_latency // << "\n"; return os.str(); } @@ -112,6 +134,7 @@ class GradientBasedNode final : public 
TaskSchedulerNode { int n_tasks = task_records_.size(); // Round robin if (num_rounds_already_ == 0) { + TVM_PY_LOG_CLEAR_SCREEN(this->logging_func); TVM_PY_LOG(INFO, this->logging_func) << "\n" << this->TuningStatistics(); } if (num_rounds_already_ < n_tasks) { @@ -178,6 +201,7 @@ class GradientBasedNode final : public TaskSchedulerNode { } record.best_time_cost_history.push_back(best_time_cost); record.trials += results.size(); + TVM_PY_LOG_CLEAR_SCREEN(this->logging_func); TVM_PY_LOG(INFO, this->logging_func) << "[Updated] Task #" << task_id << ": " << record.task->task_name << "\n" << this->TuningStatistics(); diff --git a/src/meta_schedule/utils.h b/src/meta_schedule/utils.h index cf9a32917031..f0b736081670 100644 --- a/src/meta_schedule/utils.h +++ b/src/meta_schedule/utils.h @@ -59,6 +59,7 @@ ::tvm::meta_schedule::PyLogMessage(__FILE__, __LINE__, logging_func, \ PyLogMessage::Level::logging_level) \ .stream() +#define TVM_PY_LOG_CLEAR_SCREEN(logging_func) clear_logging(__FILE__, __LINE__, logging_func) namespace tvm { namespace meta_schedule { @@ -66,10 +67,13 @@ namespace meta_schedule { /*! * \brief Class to accumulate an log message on the python side. Do not use directly, instead use * TVM_PY_LOG(DEBUG), TVM_PY_LOG(INFO), TVM_PY_LOG(WARNING), TVM_PY_ERROR(ERROR). + * \sa TVM_PY_LOG + * \sa TVM_PY_LOG_CLEAR_SCREEN */ class PyLogMessage { public: enum class Level : int32_t { + CLEAR = -10, DEBUG = 10, INFO = 20, WARNING = 30, @@ -81,6 +85,8 @@ class PyLogMessage { : file_(file), lineno_(lineno), logging_func_(logging_func), logging_level_(logging_level) {} TVM_NO_INLINE ~PyLogMessage() { + ICHECK(logging_level_ != Level::CLEAR) + << "Cannot use CLEAR as logging level in TVM_PY_LOG, please use TVM_PY_LOG_CLEAR_SCREEN."; if (this->logging_func_.defined()) { logging_func_(static_cast(logging_level_), stream_.str()); } else { @@ -107,6 +113,32 @@ class PyLogMessage { Level logging_level_; }; +/*! + * \brief Whether the tuning is running on ipython kernel. 
+ * \return A boolean indicating whether ipython kernel is used. + */ +inline bool using_ipython() { + bool flag = false; + const auto* f_using_ipython = runtime::Registry::Get("meta_schedule.using_ipython"); + if (f_using_ipython->defined()) flag = (*f_using_ipython)(); + return flag; +} + +/*! + * \brief A helper function to clear logging output for ipython kernel and console. + * \param file The file name. + * \param lineno The line number. + * \param logging_func The logging function. + */ +inline void clear_logging(const char* file, int lineno, PackedFunc logging_func) { + if (logging_func.defined() && using_ipython()) { + logging_func(static_cast(PyLogMessage::Level::CLEAR), ""); + } else { + // this would clear all logging output in the console + runtime::detail::LogMessage(file, lineno).stream() << "\033c\033[3J\033[2J\033[0m\033[H"; + } +} + /*! \brief The type of the random state */ using TRandState = support::LinearCongruentialEngine::TRandState; diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index 0267352fd697..5cc4f8f6a404 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -115,11 +115,11 @@ def main(placeholder: T.Buffer[(1, 2, 16, 16, 4), "float32"], T_layout_trans: T. 
@pytest.mark.parametrize( "model_name, input_shape, target, layout", [ - ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"), + ("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"), ("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"), - ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=16", "NHWC"), + ("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", "NHWC"), ("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NHWC"), - ("bert_base", [1, 64], "llvm --num-cores=16", None), + ("bert_base", [1, 64], "llvm --num-cores=12", None), ("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None), ], ) @@ -242,7 +242,7 @@ def print_results(self) -> None: input_name = "data" dev = tvm.cpu() - target = Target("llvm --num-cores=16") + target = Target("llvm --num-cores=12") data = tvm.nd.array(data_sample, dev) database = TestDummyDatabase() @@ -250,7 +250,7 @@ def print_results(self) -> None: database.commit_workload(tvmgen_default_fused_layout_transform_1) database.commit_workload(tvmgen_default_fused_nn_contrib_conv2d_NCHWc) - with database, tvm.transform.PassContext( + with database, tvm.transform.PassContext( # pylint: disable=not-context-manager opt_level=3, config={"relay.backend.use_meta_schedule": True}, ): @@ -295,7 +295,7 @@ def test_meta_schedule_relay_lowering(): input_name = "data" dev = tvm.cpu() - target = Target("llvm --num-cores=16") + target = Target("llvm --num-cores=12") data = tvm.nd.array(data_sample, dev) with tempfile.TemporaryDirectory() as work_dir: @@ -542,11 +542,11 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): if __name__ == """__main__""": - test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "llvm --num-cores=12", None) test_meta_schedule_tune_relay("resnet_18", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", "NCHW") - test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 
224], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "llvm --num-cores=12", None) test_meta_schedule_tune_relay("mobilenet_v2", [1, 3, 224, 224], "nvidia/geforce-rtx-3070", None) - test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=16", None) + test_meta_schedule_tune_relay("bert_base", [1, 64], "llvm --num-cores=12", None) test_meta_schedule_tune_relay("bert_base", [1, 64], "nvidia/geforce-rtx-3070", None) test_meta_schedule_te2primfunc_argument_order() test_meta_schedule_relay_lowering() From cc6e01edc6390de19c72f9283a6d4fa178672836 Mon Sep 17 00:00:00 2001 From: chengven027-intellif Date: Mon, 26 Sep 2022 17:13:22 +0800 Subject: [PATCH 243/704] [frontend][pytorch]support aten::zero_ operator (#12872) support aten::zero_ operator --- python/tvm/relay/frontend/pytorch.py | 5 +++++ tests/python/frontend/pytorch/test_forward.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index e35e23b3381c..c1bf69502ba8 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -738,6 +738,10 @@ def zeros(self, inputs, input_types): dtype = self.default_dtype return self.full_impl(data, 0, dtype) + def zero_(self, inputs, input_types): + data = inputs[0] + return self.full_impl(self.infer_shape(data), 0, input_types[0]) + def zeros_like(self, inputs, input_types): data = inputs[0] out = _op.zeros_like(data) @@ -3462,6 +3466,7 @@ def create_convert_map(self): "aten::ones": self.ones, "aten::ones_like": self.ones_like, "aten::zeros": self.zeros, + "aten::zero_": self.zero_, "aten::zeros_like": self.zeros_like, "aten::new_ones": self.new_ones, "aten::full": self.full, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 5236b763faf0..33c70a4d74a4 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ 
b/tests/python/frontend/pytorch/test_forward.py @@ -3257,6 +3257,13 @@ def forward(self, *args): verify_model(Zeros1().float().eval(), input_data=[]) +def test_forward_zero_(): + def test_func(x): + return x.zero_() + + verify_model_with_input(test_func, [torch.rand([1, 3, 10, 10]).float()]) + + @tvm.testing.uses_gpu def test_forward_zeros_like(): """test_forward_zeros_like""" From 87085b0e0dad2a422993472e35431d4f22fd69d8 Mon Sep 17 00:00:00 2001 From: chengven027-intellif Date: Mon, 26 Sep 2022 17:14:33 +0800 Subject: [PATCH 244/704] [frontend][pytorch]Support aten::Tensor_split operator (#12871) Support aten::Tensor_split operator --- python/tvm/relay/frontend/pytorch.py | 54 +++++++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 22 ++++++++ 2 files changed, 76 insertions(+) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index c1bf69502ba8..1b86b120dfcc 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -559,6 +559,59 @@ def split_with_sizes(self, inputs, input_types): return _op.split(data, indices, dim) + def tensor_split(self, inputs, input_types): + # Reference: https://pytorch.org/docs/stable/generated/torch.tensor_split.html + import torch + + if not isinstance(inputs[1], (int, list, tuple, torch.Tensor)): + msg = "indices_or_sections type %s could not be parsed in tensor_split op" % ( + type(inputs[1]) + ) + raise AssertionError(msg) + + if isinstance(inputs[1], torch.Tensor) and not ( + list(inputs[1].shape) == [] or list(inputs[1].shape) == 1 + ): + msg = "indices_or_sections must be a zero-dimensional or one-dimensional long tensor" + raise AssertionError(msg) + + if isinstance(inputs[1], int) or ( + isinstance(inputs[1], torch.Tensor) and list(inputs[1].shape) == [] + ): + data = inputs[0] + n = int(inputs[1]) + dim = int(inputs[2]) + + split_size = int(self.infer_shape(data)[dim] / n) + split_rest = int(self.infer_shape(data)[dim] % n) + + indices = [] 
+ split_index = split_size + if split_rest == 0: + for i in range(n - 1): + indices.append(split_index) + split_index += split_size + else: + for i in range(split_rest): + indices.append(split_index + 1) + split_index = (i + 1) * (split_index + 1) + for i in range(n - split_rest - 1): + split_index += split_size + indices.append(split_index) + + return _op.split(data, indices, dim) + else: + data = inputs[0] + sections = inputs[1] + dim = int(inputs[2]) + + if isinstance(sections, tuple): + sections = list(sections) + elif isinstance(sections, torch.Tensor): + sections = sections.cpu().numpy().tolist() + + return _op.split(data, sections, dim) + def select(self, inputs, input_types): data = inputs[0] dim = int(inputs[1]) @@ -3484,6 +3537,7 @@ def create_convert_map(self): "aten::slice": self.slice, "aten::narrow": self.narrow, "aten::split": self.split, + "aten::tensor_split": self.tensor_split, "aten::split_with_sizes": self.split_with_sizes, "aten::select": self.select, "aten::take": self.take, diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 33c70a4d74a4..3c8bd5efd80d 100755 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -959,6 +959,28 @@ def forward(self, *args): verify_model(Split([2, 3, 5], 1).float().eval(), input_data=input_data) +@tvm.testing.uses_gpu +def test_forward_tensor_split(): + """test_forward_tensor_split""" + torch.set_grad_enabled(False) + input_shape = [4, 10] + + class Tensor_Split(Module): + def __init__(self, split_size_or_sections, dim): + super().__init__() + self.split_size_or_sections = split_size_or_sections + self.dim = dim + + def forward(self, *args): + return torch.tensor_split(args[0], self.split_size_or_sections, self.dim) + + input_data = torch.rand(input_shape).float() + verify_model(Tensor_Split(2, 0).float().eval(), input_data=input_data) + verify_model(Tensor_Split(torch.tensor(3), 1).float().eval(), 
input_data=input_data) + verify_model(Tensor_Split([2, 3, 5], 1).float().eval(), input_data=input_data) + verify_model(Tensor_Split((2, 3, 5), 1).float().eval(), input_data=input_data) + + @tvm.testing.uses_gpu def test_forward_avgpool1d(): """test_forward_avgpool1d""" From 4ef1465d409655322cbeacbbb1b64e7791b7bf8a Mon Sep 17 00:00:00 2001 From: Florin Blanaru Date: Mon, 26 Sep 2022 13:37:33 +0300 Subject: [PATCH 245/704] [skip ci] Temporarily disable comments bot (#12903) [skip ci] Disable comment bot --- .github/{workflows => disabled_workflows}/pr_comment_bot.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{workflows => disabled_workflows}/pr_comment_bot.yml (100%) diff --git a/.github/workflows/pr_comment_bot.yml b/.github/disabled_workflows/pr_comment_bot.yml similarity index 100% rename from .github/workflows/pr_comment_bot.yml rename to .github/disabled_workflows/pr_comment_bot.yml From b6a660be5860a851725b417565ecf71bfa343bc7 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Mon, 26 Sep 2022 09:38:43 -0700 Subject: [PATCH 246/704] [BUILD] Re-enable ccache by default (#12839) * [BUILD] Re-enable ccache by default Previously ccache was disabled because of possible issues with hexagon. Re-enabling it to provide a best effort attempt at using it. 
* set tvm_option, set variables correctly * clean up comment, fatal error if launcher is defined with USE_CCACHE=ON * add ccache to libinfo * more libinfo * add launcher to summary, move ccache to seperate file * Update cmake/utils/Summary.cmake Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> * correct name for Summary.cmake Co-authored-by: driazati <9407960+driazati@users.noreply.github.com> --- CMakeLists.txt | 6 +++++ cmake/config.cmake | 11 ++++++++ cmake/modules/LibInfo.cmake | 1 + cmake/utils/CCache.cmake | 52 ++++++++++++++++++++++++++++++++++++ cmake/utils/Summary.cmake | 1 + docs/install/from_source.rst | 2 ++ src/support/libinfo.cc | 1 + 7 files changed, 74 insertions(+) create mode 100644 cmake/utils/CCache.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c355238b8c8..188f9fb1c7a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,7 @@ tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO) tvm_option(USE_CUSTOM_LOGGING "Use user-defined custom logging, tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl must be implemented" OFF) tvm_option(USE_ALTERNATIVE_LINKER "Use 'mold' or 'lld' if found when invoking compiler to link artifact" AUTO) +tvm_option(USE_CCACHE "Use ccache if found when invoking compiler" AUTO) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -460,6 +461,11 @@ if(USE_PIPELINE_EXECUTOR) list(APPEND RUNTIME_SRCS ${RUNTIME_PIPELINE_SRCS}) endif(USE_PIPELINE_EXECUTOR) +# Caches the build. +# Note that ccache-3.x doesn't support nvcc well, so CUDA kernels may never hit the cache and still +# need to be re-compiled every time. Using ccache 4.0+ can resolve this issue. 
+include(cmake/utils/CCache.cmake) + # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 18725de844b2..7067af42e9f1 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -352,6 +352,17 @@ set(USE_LIBBACKTRACE AUTO) # runtime functions to be unavailable to the program. set(BUILD_STATIC_RUNTIME OFF) +# Caches the build so that building is faster when switching between branches. +# If you switch branches, build and then encounter a linking error, you may +# need to regenerate the build tree through "make .." (the cache will +# still provide significant speedups). +# Possible values: +# - AUTO: search for path to ccache, disable if not found. +# - ON: enable ccache by searching for the path to ccache, report an error if not found +# - OFF: disable ccache +# - /path/to/ccache: use specific path to ccache +set(USE_CCACHE AUTO) + # Whether to enable PAPI support in profiling. PAPI provides access to hardware # counters while profiling. # Possible values: diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index 6bc8f6b46390..73d3a9dbbe10 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -117,6 +117,7 @@ function(add_lib_info src_file) TVM_INFO_USE_CLML="${USE_CLML}" TVM_INFO_USE_CLML_GRAPH_EXECUTOR="${USE_CLML_GRAPH_EXECUTOR}" TVM_INFO_USE_UMA="${USE_UMA}" + TVM_INFO_USE_CCACHE="${USE_CCACHE}" ) endfunction() diff --git a/cmake/utils/CCache.cmake b/cmake/utils/CCache.cmake new file mode 100644 index 000000000000..f38a36b5dee8 --- /dev/null +++ b/cmake/utils/CCache.cmake @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_CCACHE) # True for AUTO, ON, /path/to/ccache + if(DEFINED CXX_COMPILER_LAUNCHER OR DEFINED C_COMPILER_LAUNCHER) + if("${USE_CCACHE}" STREQUAL "AUTO") + message(STATUS "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER already defined, not using ccache") + elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN}) + message(FATAL_ERROR "CXX_COMPILER_LAUNCHER or C_COMPILER_LAUNCHER is already defined, refusing to override with ccache. Either unset or disable ccache.") + endif() + else() + if("${USE_CCACHE}" STREQUAL "AUTO") # Auto mode + find_program(CCACHE_FOUND "ccache") + if(CCACHE_FOUND) + message(STATUS "Found the path to ccache, enabling ccache") + set(PATH_TO_CCACHE "ccache") + else() + message(STATUS "Didn't find the path to CCACHE, disabling ccache") + endif(CCACHE_FOUND) + elseif("${USE_CCACHE}" MATCHES ${IS_TRUE_PATTERN}) + find_program(CCACHE_FOUND "ccache") + if(CCACHE_FOUND) + message(STATUS "Found the path to ccache, enabling ccache") + set(PATH_TO_CCACHE "ccache") + else() + message(FATAL_ERROR "Cannot find ccache. Set USE_CCACHE mode to AUTO or OFF to build without ccache. 
USE_CCACHE=" "${USE_CCACHE}") + endif(CCACHE_FOUND) + else() # /path/to/ccache + set(PATH_TO_CCACHE "${USE_CCACHE}") + message(STATUS "Setting ccache path to " "${PATH_TO_CCACHE}") + endif() + # Set the flag for ccache + if(DEFINED PATH_TO_CCACHE) + set(CXX_COMPILER_LAUNCHER "${PATH_TO_CCACHE}") + set(C_COMPILER_LAUNCHER "${PATH_TO_CCACHE}") + endif() + endif() +endif(USE_CCACHE) diff --git a/cmake/utils/Summary.cmake b/cmake/utils/Summary.cmake index 1b973f253a00..e3ea925a9ae1 100644 --- a/cmake/utils/Summary.cmake +++ b/cmake/utils/Summary.cmake @@ -42,6 +42,7 @@ macro(print_summary) message(STATUS " C++ compiler ID : ${CMAKE_CXX_COMPILER_ID}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS}") + message(STATUS " CXX launcher : ${CXX_COMPILER_LAUNCHER}") message(STATUS " Linker flags : ${CMAKE_SHARED_LINKER_FLAGS}") message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") get_directory_property(READABLE_COMPILE_DEFS DIRECTORY ${PROJECT_SOURCE_DIR} COMPILE_DEFINITIONS) diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index e5622b40a173..63d8aab33623 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -141,6 +141,8 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas - On supported platforms, the `Ccache compiler wrapper `_ may be helpful for reducing TVM's build time. There are several ways to enable CCache in TVM builds: + - Leave `USE_CCACHE=AUTO` in `build/config.cmake`. CCache will be used if it is found. + - Ccache's Masquerade mode. This is typically enabled during the Ccache installation process. To have TVM use Ccache in masquerade, simply specify the appropriate C/C++ compiler paths when configuring TVM's build system. 
For example: diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index 4b2f6034730d..a7d8e6a1ae2d 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -318,6 +318,7 @@ TVM_DLL Map GetLibInfo() { {"USE_CLML", TVM_INFO_USE_CLML}, {"USE_CLML_GRAPH_EXECUTOR", TVM_INFO_USE_CLML_GRAPH_EXECUTOR}, {"USE_UMA", TVM_INFO_USE_UMA}, + {"USE_CCACHE", TVM_INFO_USE_CCACHE}, }; return result; } From 8711ba44b9bebc54bb4bc3c3f456ee3ce3d40eed Mon Sep 17 00:00:00 2001 From: Yaxing Cai Date: Mon, 26 Sep 2022 09:52:02 -0700 Subject: [PATCH 247/704] [TVMScript] Import TIR methods into the IRBuilder (#12900) This PR introduces remaining TIR methods into IRBuilder Co-authored-by: yongwww --- include/tvm/script/ir_builder/tir/ir.h | 8 + python/tvm/script/ir_builder/tir/ir.py | 396 +++++++++++++++++- src/script/ir_builder/tir/ir.cc | 11 + .../unittest/test_tvmscript_ir_builder_tir.py | 15 + 4 files changed, 428 insertions(+), 2 deletions(-) diff --git a/include/tvm/script/ir_builder/tir/ir.h b/include/tvm/script/ir_builder/tir/ir.h index dd289b691502..7460099f9448 100644 --- a/include/tvm/script/ir_builder/tir/ir.h +++ b/include/tvm/script/ir_builder/tir/ir.h @@ -435,6 +435,14 @@ void Prefetch(Buffer buffer, Array bounds); */ void Evaluate(PrimExpr value); +/*! + * \brief The pointer declaration function. + * \param dtype The data type of the pointer. + * \param storage_scope The storage scope of the pointer. + * \return The pointer. 
+ */ +PrimExpr Ptr(runtime::DataType dtype, String storage_scope = "global"); + #define TVM_TIR_IR_BUILDER_DEF_DTYPE_CAST(FuncName, DType) \ inline PrimExpr FuncName(Optional expr = NullOpt) { \ DataType dtype = DType; \ diff --git a/python/tvm/script/ir_builder/tir/ir.py b/python/tvm/script/ir_builder/tir/ir.py index 625e1291ff20..4ec1511f2907 100644 --- a/python/tvm/script/ir_builder/tir/ir.py +++ b/python/tvm/script/ir_builder/tir/ir.py @@ -17,24 +17,35 @@ # pylint: disable=missing-docstring """IRBuilder for TIR""" +import inspect +import functools from numbers import Integral -from typing import Any, Dict, List, Optional, Union, Tuple +from typing import Any, Callable, Dict, List, Optional, Union, Tuple import numpy as np # type: ignore from tvm.ir import Range, Type from tvm.runtime import convert, ndarray +from tvm.target.codegen import llvm_lookup_intrinsic_id from tvm.tir import ( Buffer, BufferLoad, BufferRegion, + Cast, + CommReducer, IntImm, IterVar, Let, PrimExpr, + Select, + Shuffle, StringImm, + type_annotation, Var, ) +from tvm.tir import Broadcast as broadcast from tvm.tir import Ramp as ramp +from tvm.tir import op as _tir_op +from tvm.tir.generic import cast from . import _ffi_api, frame @@ -1501,7 +1512,7 @@ def void(expr: Optional[PrimExpr] = None) -> PrimExpr: return _ffi_api.Void(expr) # type: ignore[attr-defined] # pylint: disable=no-member -def var(dtype, name="") -> Var: +def var(dtype: str, name: str = "") -> Var: """Construct a new tir.Var. Parameters @@ -1520,6 +1531,268 @@ def var(dtype, name="") -> Var: return Var(name, dtype) # pylint: disable=no-member +def ptr(dtype: str, storage_scope: str = "global") -> Var: + """The pointer declaration function. + + Parameters + ---------- + dtype : str + The data type of the pointer. + + storage_scope : str + The storage scope of the pointer. + + Returns + ------- + res : Var + The pointer. 
+ """ + return _ffi_api.Ptr(dtype, storage_scope) # type: ignore[attr-defined] # pylint: disable=no-member + + +def min(a: PrimExpr, b: PrimExpr) -> PrimExpr: # pylint: disable=redefined-builtin + """Compute the minimum value of two expressions. + + Parameters + ---------- + a : PrimExpr + The left hand operand + + b : PrimExpr + The right hand operand + + Returns + ------- + res : PrimExpr + The result expression. + """ + return _ffi_api.min(a, b) # type: ignore[attr-defined] # pylint: disable=no-member + + +def max(a: PrimExpr, b: PrimExpr) -> PrimExpr: # pylint: disable=redefined-builtin + """Compute the maximum value of two expressions. + + Parameters + ---------- + a : PrimExpr + The left hand operand + + b : PrimExpr + The right hand operand + + Returns + ------- + res : PrimExpr + The result expression. + """ + return _ffi_api.max(a, b) # type: ignore[attr-defined] # pylint: disable=no-member + + +def iter_var(v: Union[Var, str], dom: Range, iter_type: str, thread_tag: str) -> IterVar: + """The iteration variable. + + Parameters + ---------- + var : Union[Var, str] + The internal variable that is used for iteration. + + dom : Range + The domain of the iteration. + + iter_type : str + The iteration type. + + thread_tag : str + The thread type tag. + + Returns + ------- + res : IterVar + The iteration variable. + """ + iter_type = getattr(IterVar, iter_type) + return IterVar(dom, v, iter_type, thread_tag) + + +def comm_reducer(combiner: Callable, identity: List[PrimExpr]) -> CommReducer: + """ + Create a CommReducer from lambda inputs/outputs and the identities + + Parameters + ---------- + combiner : Callable + A binary function which takes two PrimExpr as input to return a PrimExpr. + + identity : List[PrimExpr] + A list of types of output PrimExpr. + + Returns + ------- + res : CommReducer + The CommReducer. 
+ """ + params = inspect.signature(combiner).parameters + num_args = len(params) + args = [] + for name, i in zip(params.keys(), identity + identity): + if isinstance(i, int): + args.append(Var(name, "int32")) + else: + args.append(Var(name, i.dtype)) + res = combiner(*args) + if not isinstance(res, tuple): + res = (res,) + return CommReducer(args[: num_args // 2], args[num_args // 2 :], res, identity) + + +def _op_wrapper(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + if "dtype" in kwargs: + kwargs.pop("dtype") + return func(*args, **kwargs) + + return wrapped + + +def _dtype_forward(func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + if "dtype" in kwargs: + args = (kwargs.pop("dtype"),) + args + return func(*args, **kwargs) + + return wrapped + + +# pylint: disable=invalid-name + +buffer_var = ptr +abs = _op_wrapper(_tir_op.abs) # pylint: disable=redefined-builtin +fabs = abs +acos = _op_wrapper(_tir_op.acos) +acosh = _op_wrapper(_tir_op.acosh) +address_of = _op_wrapper(_tir_op.address_of) +asin = _op_wrapper(_tir_op.asin) +asinh = _op_wrapper(_tir_op.asinh) +atan = _op_wrapper(_tir_op.atan) +atan2 = _op_wrapper(_tir_op.atan2) +atanh = _op_wrapper(_tir_op.atanh) +ceil = _op_wrapper(_tir_op.ceil) +clz = _op_wrapper(_tir_op.clz) +copysign = _op_wrapper(_tir_op.copysign) +cos = _op_wrapper(_tir_op.cos) +cosh = _op_wrapper(_tir_op.cosh) +erf = _op_wrapper(_tir_op.erf) +exp = _op_wrapper(_tir_op.exp) +exp2 = _op_wrapper(_tir_op.exp2) +exp10 = _op_wrapper(_tir_op.exp10) +floor = _op_wrapper(_tir_op.floor) +ceildiv = _op_wrapper(_tir_op.ceildiv) +floordiv = _op_wrapper(_tir_op.floordiv) +floormod = _op_wrapper(_tir_op.floormod) +fmod = _op_wrapper(_tir_op.fmod) +hypot = _op_wrapper(_tir_op.hypot) +if_then_else = _op_wrapper(_tir_op.if_then_else) +infinity = _op_wrapper(_tir_op.infinity) +isfinite = _op_wrapper(_tir_op.isfinite) +isinf = _op_wrapper(_tir_op.isinf) +isnan = _op_wrapper(_tir_op.isnan) +isnullptr = 
_op_wrapper(_tir_op.isnullptr) +ldexp = _op_wrapper(_tir_op.ldexp) +likely = _op_wrapper(_tir_op.likely) +log = _op_wrapper(_tir_op.log) +log1p = _op_wrapper(_tir_op.log1p) +log2 = _op_wrapper(_tir_op.log2) +log10 = _op_wrapper(_tir_op.log10) +lookup_param = _op_wrapper(_tir_op.lookup_param) +max_value = _op_wrapper(_tir_op.max_value) +min_value = _op_wrapper(_tir_op.min_value) +nearbyint = _op_wrapper(_tir_op.nearbyint) +nextafter = _op_wrapper(_tir_op.nextafter) +popcount = _op_wrapper(_tir_op.popcount) +power = _op_wrapper(_tir_op.power) +q_multiply_shift = _op_wrapper(_tir_op.q_multiply_shift) +ret = _op_wrapper(_tir_op.ret) +reinterpret = _dtype_forward(_tir_op.reinterpret) +round = _op_wrapper(_tir_op.round) # pylint: disable=redefined-builtin +rsqrt = _op_wrapper(_tir_op.rsqrt) +shift_left = _op_wrapper(_tir_op.shift_left) +shift_right = _op_wrapper(_tir_op.shift_right) +sigmoid = _op_wrapper(_tir_op.sigmoid) +sin = _op_wrapper(_tir_op.sin) +sinh = _op_wrapper(_tir_op.sinh) +sqrt = _op_wrapper(_tir_op.sqrt) +tan = _op_wrapper(_tir_op.tan) +tanh = _op_wrapper(_tir_op.tanh) +trunc = _op_wrapper(_tir_op.trunc) +truncdiv = _op_wrapper(_tir_op.truncdiv) +truncmod = _op_wrapper(_tir_op.truncmod) +tvm_access_ptr = _op_wrapper(_tir_op.tvm_access_ptr) +tvm_throw_last_error = _op_wrapper(_tir_op.tvm_throw_last_error) +tvm_stack_alloca = _op_wrapper(_tir_op.tvm_stack_alloca) +tvm_stack_make_shape = _op_wrapper(_tir_op.tvm_stack_make_shape) +tvm_stack_make_array = _op_wrapper(_tir_op.tvm_stack_make_array) +call_packed = _op_wrapper(_tir_op.call_packed) +call_cpacked = _op_wrapper(_tir_op.call_cpacked) +call_packed_lowered = _op_wrapper(_tir_op.call_packed_lowered) +call_cpacked_lowered = _op_wrapper(_tir_op.call_cpacked_lowered) +call_extern = _dtype_forward(_tir_op.call_extern) +call_intrin = _dtype_forward(_tir_op.call_intrin) +call_llvm_intrin = _dtype_forward(_tir_op.call_llvm_intrin) +call_llvm_pure_intrin = _dtype_forward(_tir_op.call_llvm_pure_intrin) 
+call_pure_extern = _dtype_forward(_tir_op.call_pure_extern) +tvm_access_ptr = _op_wrapper(_tir_op.tvm_access_ptr) +tvm_tuple = _op_wrapper(_tir_op.tvm_tuple) +tvm_struct_set = _op_wrapper(_tir_op.tvm_struct_set) +tvm_struct_get = _tir_op.tvm_struct_get +tvm_thread_allreduce = _op_wrapper(_tir_op.tvm_thread_allreduce) +tvm_load_matrix_sync = _op_wrapper(_tir_op.tvm_load_matrix_sync) +tvm_mma_sync = _op_wrapper(_tir_op.tvm_mma_sync) +tvm_bmma_sync = _op_wrapper(_tir_op.tvm_bmma_sync) +tvm_fill_fragment = _op_wrapper(_tir_op.tvm_fill_fragment) +tvm_store_matrix_sync = _op_wrapper(_tir_op.tvm_store_matrix_sync) +ptx_mma = _dtype_forward(_tir_op.ptx_mma) +ptx_mma_sp = _dtype_forward(_tir_op.ptx_mma_sp) +ptx_ldmatrix = _dtype_forward(_tir_op.ptx_ldmatrix) +ptx_cp_async = _dtype_forward(_tir_op.ptx_cp_async) +ptx_wait_group = _op_wrapper(_tir_op.ptx_wait_group) +ptx_commit_group = _op_wrapper(_tir_op.ptx_commit_group) +mma_store = _dtype_forward(_tir_op.mma_store) +mma_fill = _dtype_forward(_tir_op.mma_fill) +vectorlow = _dtype_forward(_tir_op.vectorlow) +vectorhigh = _dtype_forward(_tir_op.vectorhigh) +vectorcombine = _dtype_forward(_tir_op.vectorcombine) +assume = _op_wrapper(_tir_op.assume) +undef = _op_wrapper(_tir_op.undef) +tvm_call_packed = call_packed +tvm_call_cpacked = call_cpacked +tvm_call_packed_lowered = call_packed_lowered +tvm_call_cpacked_lowered = call_cpacked_lowered +TVMBackendAllocWorkspace = _op_wrapper(_tir_op.TVMBackendAllocWorkspace) +TVMBackendFreeWorkspace = _op_wrapper(_tir_op.TVMBackendFreeWorkspace) + + +class inline: + """Inline function for meta-programming. + + Parameters + ---------- + value: Any + The value to be inlined. 
+ """ + + def __init__(self, value: Any) -> None: + self.value = value + + def __iter__(self): + def f(): + for i in self.value: + yield inline(i) + + return f() + + # pylint: enable=invalid-name @@ -1581,4 +1854,123 @@ def var(dtype, name="") -> Var: "handle", "void", "var", + "ptr", + "min", + "max", + "iter_var", + "comm_reducer", + "buffer_var", + "abs", + "fabs", + "acos", + "acosh", + "address_of", + "asin", + "asinh", + "atan", + "atan2", + "atanh", + "ceil", + "clz", + "copysign", + "cos", + "cosh", + "erf", + "exp", + "exp2", + "exp10", + "floor", + "ceildiv", + "floordiv", + "floormod", + "fmod", + "hypot", + "if_then_else", + "infinity", + "isfinite", + "isinf", + "isnan", + "isnullptr", + "ldexp", + "likely", + "log", + "log1p", + "log2", + "log10", + "lookup_param", + "max_value", + "min_value", + "nearbyint", + "nextafter", + "popcount", + "power", + "q_multiply_shift", + "ret", + "reinterpret", + "round", + "rsqrt", + "shift_left", + "shift_right", + "sigmoid", + "sin", + "sinh", + "sqrt", + "tan", + "tanh", + "trunc", + "truncdiv", + "truncmod", + "tvm_access_ptr", + "tvm_throw_last_error", + "tvm_stack_alloca", + "tvm_stack_make_shape", + "tvm_stack_make_array", + "call_packed", + "call_cpacked", + "call_packed_lowered", + "call_cpacked_lowered", + "call_extern", + "call_intrin", + "call_llvm_intrin", + "call_llvm_pure_intrin", + "call_pure_extern", + "tvm_access_ptr", + "tvm_tuple", + "tvm_struct_set", + "tvm_struct_get", + "tvm_thread_allreduce", + "tvm_load_matrix_sync", + "tvm_mma_sync", + "tvm_bmma_sync", + "tvm_fill_fragment", + "tvm_store_matrix_sync", + "ptx_mma", + "ptx_mma_sp", + "ptx_ldmatrix", + "ptx_cp_async", + "ptx_wait_group", + "ptx_commit_group", + "mma_store", + "mma_fill", + "vectorlow", + "vectorhigh", + "vectorcombine", + "assume", + "undef", + "tvm_call_packed", + "tvm_call_cpacked", + "tvm_call_packed_lowered", + "tvm_call_cpacked_lowered", + "TVMBackendAllocWorkspace", + "TVMBackendFreeWorkspace", + "inline", + 
"llvm_lookup_intrinsic_id", + "Cast", + "Let", + "Select", + "Shuffle", + "type_annotation", + "broadcast", + "ramp", + "cast", ] diff --git a/src/script/ir_builder/tir/ir.cc b/src/script/ir_builder/tir/ir.cc index 28c3d69861fa..6be6e2619fea 100644 --- a/src/script/ir_builder/tir/ir.cc +++ b/src/script/ir_builder/tir/ir.cc @@ -534,6 +534,10 @@ DeclBufferFrame DeclBuffer(Array shape, DataType dtype, String buffer_ void Evaluate(PrimExpr value) { AddToParent(tvm::tir::Evaluate(value)); } +PrimExpr Ptr(runtime::DataType dtype, String storage_scope) { + return tvm::tir::Var("", tvm::PointerType(PrimType(dtype), storage_scope)); +} + using tvm::script::ir_builder::details::Namer; TVM_STATIC_IR_FUNCTOR(Namer, vtable) @@ -632,6 +636,8 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.BufferStore").set_body_typed(BufferSt TVM_REGISTER_GLOBAL("script.ir_builder.tir.Prefetch").set_body_typed(Prefetch); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Evaluate").set_body_typed(Evaluate); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.Ptr").set_body_typed(Ptr); + TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int8").set_body_typed(Int8); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int16").set_body_typed(Int16); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32").set_body_typed(Int32); @@ -650,6 +656,11 @@ TVM_REGISTER_GLOBAL("script.ir_builder.tir.Int32x16").set_body_typed(Int32x16); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Boolean").set_body_typed(Boolean); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Handle").set_body_typed(Handle); TVM_REGISTER_GLOBAL("script.ir_builder.tir.Void").set_body_typed(Void); + +TVM_REGISTER_GLOBAL("script.ir_builder.tir.min") + .set_body_typed([](PrimExpr a, PrimExpr b) -> PrimExpr { return tvm::min(a, b); }); +TVM_REGISTER_GLOBAL("script.ir_builder.tir.max") + .set_body_typed([](PrimExpr a, PrimExpr b) -> PrimExpr { return tvm::max(a, b); }); } // namespace tir } // namespace ir_builder } // namespace script diff --git 
a/tests/python/unittest/test_tvmscript_ir_builder_tir.py b/tests/python/unittest/test_tvmscript_ir_builder_tir.py index 40e13a2fbe2f..dbc9b594fb87 100644 --- a/tests/python/unittest/test_tvmscript_ir_builder_tir.py +++ b/tests/python/unittest/test_tvmscript_ir_builder_tir.py @@ -476,5 +476,20 @@ def test_ir_builder_tir_decl_buffer(): assert_structural_equal(ir_actual, ir_expected, map_free_vars=True) +def test_ir_builder_tir_inline(): + with IRBuilder() as ib: + m, n = T.inline(1), T.inline(2) + a, b = T.inline([3, 4]) + T.evaluate(m.value + n.value + a.value + b.value) + # the evaluate generated by IRBuilder + eval_actual = ib.get() + + # the expected evaluate + eval_expected = tir.Evaluate(10) + + # Check if the generated ir is expected + assert_structural_equal(eval_actual, eval_expected, map_free_vars=True) + + if __name__ == "__main__": tvm.testing.main() From fd268137237d2f6fbff4aa4517449284330c3cd8 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 26 Sep 2022 13:55:06 -0500 Subject: [PATCH 248/704] [TVMScript] Infer T.match_buffer parameters for region (#12890) * [TVMScript] Infer T.match_buffer parameters for region When using `T.match_buffer` to define a view into another buffer, default shape and dtype parameters can be inferred. * Updated unit test for new behavior The test intentionally triggers a failed match based on mismatched `elem_offset`. Therefore, the test now needs to explicitly pass an `elem_offset` to trigger the failure, as this now defaults to having a `Var` for `match_buffer` calls that represent views. 
--- python/tvm/script/tir/special_stmt.py | 68 ++++++++++++++----- .../unittest/test_tir_lower_match_buffer.py | 4 +- .../unittest/test_tvmscript_syntax_sugar.py | 25 +++++++ 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/python/tvm/script/tir/special_stmt.py b/python/tvm/script/tir/special_stmt.py index 15502055b7fc..7cbf47441053 100644 --- a/python/tvm/script/tir/special_stmt.py +++ b/python/tvm/script/tir/special_stmt.py @@ -121,8 +121,8 @@ class MatchBuffer(SpecialStmt): def __init__(self): def match_buffer( param, - shape, - dtype="float32", + shape=None, + dtype=None, data=None, strides=None, elem_offset=None, @@ -146,28 +146,64 @@ def match_buffer( offset_factor, "offset_factor", self.context.report_error, self.node.span ) buffer_name: str = self.node.lhs[0].id.name - buffer = tvm.tir.decl_buffer( - shape, - dtype, - buffer_name, - data, - strides, - elem_offset, - scope, - align, - offset_factor, - buffer_type, - axis_separators, - span=span, - ) + if isinstance(param, tvm.tir.Var): + if shape is None: + self.context.report_error( + "Shape must be specified when binding input param", + self.node.rhs.span, + ) + + if dtype is None: + dtype = "float32" + + buffer = tvm.tir.decl_buffer( + shape, + dtype, + buffer_name, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + buffer_type, + axis_separators, + span=span, + ) if param not in self.context.func_params: self.context.report_error( "Can not bind non-input param to buffer", self.node.rhs.params[0].span ) self.context.func_buffer_map[param] = buffer + elif isinstance(param, BufferSlice): buffer_region = param.as_buffer_region() + + if shape is None: + shape = [dim.extent for dim in buffer_region.region] + + if dtype is None: + dtype = buffer_region.buffer.dtype + + if elem_offset is None and offset_factor == 0: + offset_factor = 1 + + buffer = tvm.tir.decl_buffer( + shape, + dtype, + buffer_name, + data, + strides, + elem_offset, + scope, + align, + offset_factor, + 
buffer_type, + axis_separators, + span=span, + ) + self.context.current_block_scope().match_buffers.append( tvm.tir.MatchBufferRegion(buffer, buffer_region) ) diff --git a/tests/python/unittest/test_tir_lower_match_buffer.py b/tests/python/unittest/test_tir_lower_match_buffer.py index 93b7caf9cdde..6120cf2b673c 100644 --- a/tests/python/unittest/test_tir_lower_match_buffer.py +++ b/tests/python/unittest/test_tir_lower_match_buffer.py @@ -464,7 +464,7 @@ def fail_match_load(a: T.handle) -> None: with T.block(): T.reads(A[i, j]) T.writes([]) - sub_A = T.match_buffer(A[i, j], ()) + sub_A = T.match_buffer(A[i, j], (), elem_offset=0) T.evaluate(sub_A[()]) @@ -475,7 +475,7 @@ def fail_match_store(a: T.handle) -> None: with T.block(): T.reads([]) T.writes(A[i, j]) - sub_A = T.match_buffer(A[i, j], ()) + sub_A = T.match_buffer(A[i, j], (), elem_offset=0) sub_A[()] = 1 diff --git a/tests/python/unittest/test_tvmscript_syntax_sugar.py b/tests/python/unittest/test_tvmscript_syntax_sugar.py index d955ec0a8c80..2a2f7354d7cd 100644 --- a/tests/python/unittest/test_tvmscript_syntax_sugar.py +++ b/tests/python/unittest/test_tvmscript_syntax_sugar.py @@ -251,6 +251,31 @@ def test_match_buffer_int64(): assert_structural_equal(original, after_roundtrip, True) +def test_match_buffer_region_has_implicit_shape_dtype(): + @T.prim_func + def explicit_shape_dtype(A: T.Buffer[(16, 64), "int32"]): + with T.block(): + B = T.match_buffer(A[8:16, 32:64], shape=(8, 32), dtype="int32") + T.evaluate(0) + + @T.prim_func + def implicit_shape_dtype(A: T.Buffer[(16, 64), "int32"]): + with T.block(): + B = T.match_buffer(A[8:16, 32:64]) + T.evaluate(0) + + assert_structural_equal(explicit_shape_dtype, implicit_shape_dtype) + + +def test_match_buffer_input_requires_shape_arg(): + with pytest.raises(tvm.error.DiagnosticError): + + @T.prim_func + def func(a: T.handle): + A = T.match_buffer(a, dtype="int32") + T.evaluate(0) + + def test_letstmt_bufferload_without_type_annotation(): # Variable assignment of 
PrimExpr types uses the dtype of the # PrimExpr to determine the variable's dtype. Parsing of From e1f3f90588aa2d9bb71e0ca8ebc5baab865e054d Mon Sep 17 00:00:00 2001 From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com> Date: Mon, 26 Sep 2022 15:37:47 -0500 Subject: [PATCH 249/704] [TOPI][Hexagon] Implement quantize op for hexagon (#12820) * [TOPI][Hexagon] Implement quantize op for hexagon * Fix lint issue --- python/tvm/topi/hexagon/qnn/__init__.py | 2 + python/tvm/topi/hexagon/qnn/quantize.py | 80 ++++++++++++ python/tvm/topi/hexagon/utils.py | 5 + .../contrib/test_hexagon/infrastructure.py | 4 +- .../test_hexagon/topi/test_quantize.py | 121 ++++++++++++++++++ 5 files changed, 210 insertions(+), 2 deletions(-) create mode 100755 python/tvm/topi/hexagon/qnn/quantize.py create mode 100755 tests/python/contrib/test_hexagon/topi/test_quantize.py diff --git a/python/tvm/topi/hexagon/qnn/__init__.py b/python/tvm/topi/hexagon/qnn/__init__.py index ef9c025ba5b2..2616b9315a9b 100644 --- a/python/tvm/topi/hexagon/qnn/__init__.py +++ b/python/tvm/topi/hexagon/qnn/__init__.py @@ -23,3 +23,5 @@ dequantize_compute, dequantize_schedule, ) + +from .quantize import quantize_compute, tir_quantize_schedule diff --git a/python/tvm/topi/hexagon/qnn/quantize.py b/python/tvm/topi/hexagon/qnn/quantize.py new file mode 100755 index 000000000000..ff03aac0a862 --- /dev/null +++ b/python/tvm/topi/hexagon/qnn/quantize.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Compute and schedule for hexagon quantize +Please note the following assumptions made by the implementation: +1) The input and output data will be multiple of crouton layout +2) And the supported layout is NHWC +3) The input layout will be nhwc-4h2w32c2w-2d and + output layout will be nhwc-8h8w32c-2d""" + + +from tvm import te +from tvm import tir +from ..utils import get_layout_transform_fn, saturate + + +def quantize_compute(tensor_A: te.Tensor, scale: float, zero_point: int, dtype: str): + """Compute for quantize""" + scale_recip = 1 / scale + + return te.compute( + tensor_A.shape, + lambda n, h, w, c: saturate( + ((tensor_A[n, h, w, c] * scale_recip).astype("int32") + zero_point), + dtype, + ).astype(dtype), + name="quantize", + ) + + +def tir_quantize_schedule( + out_M: te.Tensor, + tensor_A: te.Tensor, + input_layout: str, + output_layout: str, +): + """Schedule for output layout nhwc-8h8w32c-2d""" + func = te.create_prim_func([tensor_A, out_M]) + + s = tir.Schedule(func) + + block = s.get_block("quantize") + + input_transformed_layout = get_layout_transform_fn(input_layout) + s.transform_layout(block, buffer=tensor_A.name, index_map=input_transformed_layout) + + output_transformed_layout = get_layout_transform_fn(output_layout) + s.transform_layout(block, buffer=out_M.name, index_map=output_transformed_layout) + + # Fixed chunk size is 2048 byte + # For uint8 the layout for fixed chunk is 8x8x32 + # where each element is 1 bytes + # Split and reorder is done to iterate over the fixed chunk + # Channel is split by a 
factor of 32 + # Width is split by a factor of 8 + # Height is split by a factor of 8 + n, h, w, c = s.get_loops(block) + + h_o, h_i = s.split(h, [None, 8]) + w_o, w_i = s.split(w, [None, 8]) + c_o, c_i = s.split(c, [None, 32]) + wio, wii = s.split(w_i, [None, 4]) + + s.reorder(n, h_o, w_o, c_o, h_i, wio, wii, c_i) + + return s diff --git a/python/tvm/topi/hexagon/utils.py b/python/tvm/topi/hexagon/utils.py index 9939e5b6fbb7..dab9aa3f74ab 100644 --- a/python/tvm/topi/hexagon/utils.py +++ b/python/tvm/topi/hexagon/utils.py @@ -294,3 +294,8 @@ def within_range(val, dtype): fixed_point_value = int(round(flp * scale_f[0])) return fixed_point_value, exp_scale_factor + + +def saturate(x: te.Tensor, dtype: str): + """Saturate value for the specified data type""" + return te.max(te.min_value(dtype), te.min(x, te.max_value(dtype))) diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 71960b649ea2..1058e1dd8117 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -334,8 +334,8 @@ def quantize_np(arr_np: numpy.ndarray, dtype: str): qmax = 255 qmin = 0 elif dtype == "int8": - qmax = 128 - qmin = -127 + qmax = 127 + qmin = -128 else: raise RuntimeError(f"Unsupported quantized data type '{dtype}'") fmin = numpy.amin(arr_np) diff --git a/tests/python/contrib/test_hexagon/topi/test_quantize.py b/tests/python/contrib/test_hexagon/topi/test_quantize.py new file mode 100755 index 000000000000..2c1718d29465 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_quantize.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest +import numpy as np + +import tvm +from tvm import te +import tvm.topi.hexagon.qnn as s1 +from ..infrastructure import allocate_hexagon_array, transform_numpy, quantize_np + + +@tvm.testing.fixture +def expected_output_np(input_np, output_dtype): + global scale, zero_point + quant_np, scale, zero_point = quantize_np(input_np, output_dtype) + return quant_np + + +@tvm.testing.fixture +def input_np(input_shape, input_dtype): + return np.random.random(input_shape).astype(input_dtype) + + +@tvm.testing.fixture +def transformed_input_np(input_np, input_crouton_layout): + return transform_numpy(input_np, "nhwc", input_crouton_layout) + + +@tvm.testing.fixture +def transformed_expected_output_np(expected_output_np, output_layout): + return transform_numpy(expected_output_np, "nhwc", output_layout) + + +class TestQuantize: + input_crouton_layout, output_layout, input_dtype = tvm.testing.parameters( + ("nhwc-4h2w32c2w-2d", "nhwc-8h8w32c-2d", "float32"), + ) + + output_dtype = tvm.testing.parameter("uint8", "int8") + + input_shape = tvm.testing.parameter( + (1, 8, 8, 32), (1, 16, 16, 32), (1, 16, 16, 128), (1, 64, 64, 64) + ) + + @tvm.testing.requires_hexagon + def test_quantize( + self, + input_dtype, + output_dtype, + input_np, + transformed_input_np, + input_shape, + expected_output_np, + transformed_expected_output_np, + input_crouton_layout, + output_layout, + hexagon_session, + ): + 
target_hexagon = tvm.target.hexagon("v69") + A = te.placeholder(input_shape, name="A", dtype=input_dtype) + + M = s1.quantize_compute(A, scale, zero_point, output_dtype) + + tir_schedule = s1.tir_quantize_schedule(M, A, input_crouton_layout, output_layout) + + sch = tir_schedule.mod + + input_axis_separator = [4] + output_axis_separator = [4] + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build( + sch, + [A, M], + tvm.target.Target(target_hexagon, host=target_hexagon), + name="quantize", + ) + + A_data_nd = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + dtype=input_dtype, + axis_separators=input_axis_separator, + mem_scope="global.vtcm", + ) + + M_data_nd = allocate_hexagon_array( + hexagon_session.device, + tensor_shape=transformed_expected_output_np.shape, + dtype=output_dtype, + axis_separators=output_axis_separator, + mem_scope="global.vtcm", + ) + + mod = hexagon_session.load_module(func) + mod(A_data_nd, M_data_nd) + + b, h, w, c = expected_output_np.shape + + # convert nd to np and reshape to fixed chunk size layout + M_data_np = M_data_nd.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) + + np.testing.assert_allclose(transformed_expected_output_np, M_data_np, atol=1) + + +if __name__ == "__main__": + tvm.testing.main() From f25a702a1fb2aef6ded6fbb0384720dadacbd8ef Mon Sep 17 00:00:00 2001 From: Tasmia Rahman <89925728+trahman-quic@users.noreply.github.com> Date: Mon, 26 Sep 2022 15:40:03 -0500 Subject: [PATCH 250/704] [TOPI][Hexagon] Add schedule and test for maxpool uint8 layout (#12826) * [TOPI][Hexagon] Add schedule and test for maxpool uint8 layout for hexagon * Fix lint issue --- .../tvm/topi/hexagon/slice_ops/max_pool2d.py | 55 +++++---- .../topi/test_max_pool2d_slice.py | 105 +++++++++++------- 2 files changed, 100 insertions(+), 60 deletions(-) diff --git a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py index 4bf958c11694..d56879e45b84 100644 
--- a/python/tvm/topi/hexagon/slice_ops/max_pool2d.py +++ b/python/tvm/topi/hexagon/slice_ops/max_pool2d.py @@ -73,8 +73,10 @@ def max_pool2d_compute(A, out_shape, kernel, stride, dilation): return Max -def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str): - """Schedule for input and output layout nhwc-8h2w32c2w""" +def STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c( + outs: te.Tensor, ins: te.Tensor, output_layout: str, input_layout: str +): + """Schedule for input and output layout nhwc-8h2w32c2w and nhwc-8h8w32c""" func = te.create_prim_func([ins, outs]) s = tir.Schedule(func) @@ -93,10 +95,14 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st Max = s.get_block("max") - input_transform_fn = get_layout_transform_fn(input_layout) - output_transform_fn = get_layout_transform_fn(output_layout) + if input_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + ): + input_transform_fn = get_layout_transform_fn(input_layout) + s.transform_layout(Max, ("read", 0), input_transform_fn) - s.transform_layout(Max, ("read", 0), input_transform_fn) + output_transform_fn = get_layout_transform_fn(output_layout) s.transform_layout(Max, ("write", 0), output_transform_fn) # pylint: disable=line-too-long @@ -120,13 +126,21 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st rw, ) = s.get_loops(Max) - # Restructure the loops from NHWC to nhwc_8h2w32c2w, with loops for 'max's reduction + # Restructure the loops from NHWC to nhwc_8h2w32c2w or nhwc_8h8w32c, with loops for 'max's reduction # axes at the very end. 
- ho, hi = s.split(h, [None, 8]) - wo, wi = s.split(w, [None, 4]) - wio, wii = s.split(wi, [None, 2]) - co, ci = s.split(c, [None, 32]) - s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw) + # nhwc_8h2w32c2w layout is for float16 and nhwc-8h8w32c-2d layout is for uint8/int8 + if output_layout == "nhwc-8h2w32c2w-2d": + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 4]) + wio, wii = s.split(wi, [None, 2]) + co, ci = s.split(c, [None, 32]) + s.reorder(n, ho, wo, co, hi, wio, ci, wii, rh, rw) + elif output_layout == "nhwc-8h8w32c-2d": + ho, hi = s.split(h, [None, 8]) + wo, wi = s.split(w, [None, 8]) + co, ci = s.split(c, [None, 32]) + + s.reorder(n, ho, wo, co, hi, wi, ci, rh, rw) # TODO: Enable vectorization. # Hexagon v69's HVX units support SIMD operations on 64-element float16 vectors. @@ -154,10 +168,10 @@ def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: st return s -def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): - """Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w""" +def STIR_schedule_n11c(outs, ins, output_layout: str, input_layout: str): + """Schedule for output layout: n11c-1024c, n11c-2048c-2d;""" - # NOTE: This function is a variation of the STIR_schedule_nhwc_8h2w32c2w + # NOTE: This function is a variation of the STIR_schedule_maxpool2d # functions. Most of that function's code comments apply to this function # as well, but are ommited for brevity. 
@@ -181,7 +195,10 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): rh, rw, ) = s.get_loops(Max) - co, ci = s.split(c, [None, 1024]) + if output_layout == "n11c-1024c-2d": + co, ci = s.split(c, [None, 1024]) + else: + co, ci = s.split(c, [None, 2048]) # s.vectorize(ci) return s @@ -189,8 +206,8 @@ def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str): def max_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str): """STIR based schedule""" - if output_layout == "nhwc-8h2w32c2w-2d": - return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout) - if output_layout == "n11c-1024c-2d": - return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout) + if output_layout in ("nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d"): + return STIR_schedule_nhwc_8h2w32c2w_nhwc_8h8w32c(outs, ins, output_layout, input_layout) + if output_layout in ("n11c-1024c-2d", "n11c-2048c-2d"): + return STIR_schedule_n11c(outs, ins, output_layout, input_layout) raise RuntimeError(f"Unexpected layout '{output_layout}'") diff --git a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py index f827f025af17..de60ffc6df4d 100644 --- a/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py +++ b/tests/python/contrib/test_hexagon/topi/test_max_pool2d_slice.py @@ -50,6 +50,31 @@ def transformed_input_np_padded(input_np_padded, input_layout): return transform_numpy(input_np_padded, "nhwc", input_layout) +(input_layout, dtype) = tvm.testing.parameters( + ("nhwc-8h2w32c2w-2d", "float16"), + ("nhwc-8h8w32c-2d", "uint8"), +) + + +@tvm.testing.fixture +def output_layout(output_shape, dtype): + o_b, o_h, o_w, o_c = output_shape + if dtype == "float16": + if o_h == 1 and o_w == 1: + return "n11c-1024c-2d" + else: + assert o_h % 8 == 0 and o_w % 4 == 0, "Invalid output shape" + return "nhwc-8h2w32c2w-2d" + elif dtype in ("int8", "uint8"): + 
if o_h == 1 and o_w == 1: + return "n11c-2048c-2d" + else: + assert o_h % 8 == 0 and o_w % 8 == 0, "Invalid output shape" + return "nhwc-8h8w32c-2d" + else: + raise RuntimeError(f"Unsupported data type '{dtype}'") + + class TestmaxPool2dSlice: _param_descs = [ "out_shape", # output_shape @@ -59,8 +84,6 @@ class TestmaxPool2dSlice: "pad", # padding "ceil", # ceil_mode "cnt_padded", # count_include_pad - "out_layout", # output_layout - None, # dtype None, # input_tensor_populator ] @@ -73,8 +96,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -85,8 +106,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -97,8 +116,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test non-one stride and dilation @@ -110,8 +127,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -122,8 +137,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -134,8 +147,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test non-zero padding @@ -147,8 +158,6 @@ class TestmaxPool2dSlice: [1, 1, 1, 1], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), ( @@ -159,8 +168,6 @@ class TestmaxPool2dSlice: [1, 2, 3, 4], False, True, - "nhwc-8h2w32c2w-2d", - "float16", TensorContentRandom(), ), # Test n11c-1024c-2d layout which will require input and output to have different layout @@ -172,8 +179,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -184,8 +189,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -196,8 
+199,6 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ( @@ -208,19 +209,14 @@ class TestmaxPool2dSlice: [0, 0, 0, 0], False, True, - "n11c-1024c-2d", - "float16", TensorContentRandom(), ), ] _param_ids = get_multitest_ids(_multitest_params, _param_descs) - input_layout = tvm.testing.parameter( - "nhwc-8h2w32c2w-2d", - ) - - # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" + # NOTE: input_layout is always assumed to be "nhwc-8h2w32c2w-2d" for float16 + # and "nhwc-8h8w32c-2d" for uint8 ( output_shape, kernel, @@ -229,8 +225,6 @@ class TestmaxPool2dSlice: padding, ceil_mode, count_include_pad, - output_layout, - dtype, input_tensor_populator, ) = tvm.testing.parameters(*_multitest_params, ids=_param_ids) @@ -283,15 +277,32 @@ def input_shape(self, output_shape, kernel, padding, stride, dilation, output_la return [o_b, in_h, in_w, o_c] @tvm.testing.fixture - def input_shape_padded(self, input_shape, padding, output_layout): + def input_shape_padded(self, dtype, input_shape, padding, output_layout): # Input shape is adjusted to account for 'padding'. Also, due to the physical # layout of the buffer, height and width are adjusted so that they are a # multiple of 8 and 4 respectively. - # NOTE: Input layout is always assumed to be nhwc-8h2w32c2w-2d. + # NOTE: For float16, the input layout is always assumed to be nhwc-8h2w32c2w-2d and + # for int8/uint8, it's nhwc-8h8w32c-2d. + # For both nhwc-8h2w32c2w-2d and nhwc-8h8w32c-2d, the height should be a multiple + # of 8. However, the width should be a multiple of 4 for the first case and 8 for + # the second case. 
+ + height_mult = 8 + if dtype == "float16": + width_mult = 4 # input layout : nhwc-8h2w32c2w-2d + elif dtype in ("uint8", "int8"): + width_mult = 8 # input layout : nhwc-8h8w32c-2d + else: + raise RuntimeError(f"Unsupport dtype '{dtype}'") + pad_before_h, pad_before_w = padding[:2] pad_after_h, pad_after_w = padding[2:] - padded_input_height = ((input_shape[1] + pad_before_h + pad_after_h + 7) // 8) * 8 - padded_input_width = ((input_shape[2] + pad_before_w + pad_after_w + 3) // 4) * 4 + padded_input_height = ( + (input_shape[1] + pad_before_h + pad_after_h + height_mult - 1) // height_mult + ) * height_mult + padded_input_width = ( + (input_shape[2] + pad_before_w + pad_after_w + width_mult - 1) // width_mult + ) * width_mult return [input_shape[0], padded_input_height, padded_input_width, input_shape[3]] @tvm.testing.fixture @@ -340,9 +351,12 @@ def test_max_pool2d_slice( sch = tir_schedule.mod input_axis_separator = [4] - if output_layout == "nhwc-8h2w32c2w-2d": - output_axis_separator = [4] - elif output_layout == "n11c-1024c-2d": + if output_layout in ( + "nhwc-8h2w32c2w-2d", + "nhwc-8h8w32c-2d", + "n11c-1024c-2d", + "n11c-2048c-2d", + ): output_axis_separator = [4] else: raise RuntimeError(f"Unexpected layout '{output_layout}'") @@ -374,12 +388,21 @@ def test_max_pool2d_slice( b, h, w, c = output_shape if output_layout == "nhwc-8h2w32c2w-2d": output_np = output_arr.numpy().reshape([b, h // 8, w // 4, c // 32, 8, 2, 32, 2]) + elif output_layout == "nhwc-8h8w32c-2d": + output_np = output_arr.numpy().reshape([b, h // 8, w // 8, c // 32, 8, 8, 32]) + elif output_layout == "n11c-2048c-2d": + output_np = output_arr.numpy().reshape([b, 1, 1, c // 2048, 2048]) elif output_layout == "n11c-1024c-2d": output_np = output_arr.numpy().reshape([b, 1, 1, c // 1024, 1024]) else: raise RuntimeError(f"Unexpected layout '{output_layout}'") - np.testing.assert_allclose(output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3) + if dtype == "float16": + 
np.testing.assert_allclose( + output_np, transformed_expected_output_np, rtol=1e-3, atol=1e-3 + ) + elif dtype == "uint8": + np.testing.assert_allclose(output_np, transformed_expected_output_np, atol=1) if __name__ == "__main__": From d4fb957ae1caf34604f03d9348ee9b3d3acb4709 Mon Sep 17 00:00:00 2001 From: Mehrdad Hessar Date: Mon, 26 Sep 2022 15:14:10 -0700 Subject: [PATCH 251/704] [microTVM][ARM] Improve dense DSP micro kernel (#12908) Fix micro kernel --- python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py index ffc48eaabd59..f1c0e3ea8d6d 100644 --- a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py @@ -207,7 +207,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int16_t bb_pad[{bb_pad_size}]; int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) {{ + if ( {M} < 2 && {N} < 2 ) {{ retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); goto out; }} @@ -313,7 +313,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int16_t bb_pad[{bb_pad_size}]; int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) {{ + if ( {M} < 2 && {N} < 2 ) {{ retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); goto out; }} @@ -393,7 +393,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int A_stride, int B_stride, int C_stride) {{ int32_t retcode = 0; - if ( {M} < 2 || {N} < 2 ) {{ + if ( {M} < 2 && {N} < 2 ) {{ retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); goto out; }} @@ -471,7 +471,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int A_stride, int B_stride, int C_stride) {{ int32_t retcode = 0; - if ( {M} < 2 || {N} < 2 ) {{ + if ( {M} < 2 && {N} < 2 ) {{ retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, 
C_stride); goto out; }} From 830ebc4ec8d588bc84c283c45b22dbee1340b95d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 26 Sep 2022 17:30:59 -0500 Subject: [PATCH 252/704] [TIR] Refactor IndexMap::Inverse in terms of NonSurjectiveInverse (#12904) The two implementations were largely identical but had drifted apart, resulting in bugs such as https://github.com/apache/tvm/issues/12852. This commit removes this duplication by writing `Inverse` in terms of `NonSurjectiveInverse`. The merged version of `NonSurjectiveInverse` contains the bugfix from https://github.com/apache/tvm/pull/11841, which was previously present only in `Inverse`. --- include/tvm/tir/index_map.h | 16 +++----- src/tir/ir/index_map.cc | 74 +++++++++++-------------------------- 2 files changed, 28 insertions(+), 62 deletions(-) diff --git a/include/tvm/tir/index_map.h b/include/tvm/tir/index_map.h index 8a176cb3cee8..e1b323462cda 100644 --- a/include/tvm/tir/index_map.h +++ b/include/tvm/tir/index_map.h @@ -73,10 +73,10 @@ class IndexMapNode : public Object { /*! * \brief The inverse index map. * - * When this is defined, IndexMap::Inverse will return the pre-defined inverse index map. - * Otherwise, the inverse index map will be computed on the fly. - * It is the user's responsibility to ensure the correctness of the pre-defined inverse index - * map. + * When this is defined, IndexMap::Inverse will return the + * pre-defined inverse index map. Otherwise, the inverse index map + * will be computed on the fly. It is the user's responsibility to + * ensure the correctness of the pre-defined inverse index map. * * \note ObjectRef is used here instead of IndexMap to avoid circular reference. */ @@ -190,12 +190,8 @@ class IndexMap : public ObjectRef { * The range of the input indices is required in order to ensure * that the transformation is bijective over the input domain. * - * TODO(Lunderberg): Look into allowing non-bijective - * transformations. 
If injective, the inverse mapping could still - * be generated with some predicate (see NonSurjectiveInverse). If - * non-injective, could simplify the implementation of other - * optimizations (e.g. double buffering as a map `lambda *indices: - * [buffer_loop%2, *indices]`). + * If the user has supplied an `inverse_index_map`, that map is + * assumed to be correct and bijective, and is returned. */ IndexMap Inverse(Array initial_ranges) const; diff --git a/src/tir/ir/index_map.cc b/src/tir/ir/index_map.cc index 2ffc5079246b..2c5349ab9941 100644 --- a/src/tir/ir/index_map.cc +++ b/src/tir/ir/index_map.cc @@ -54,6 +54,14 @@ IndexMap IndexMap::FromFunc(int ndim, runtime::TypedPackedFunc(A } std::pair IndexMap::NonSurjectiveInverse(Array initial_ranges) const { + if ((*this)->inverse_index_map.defined()) { + // return the pre-defined inverse index map if exists. In this + // case, the user-defined inverse is assumed to be correct and + // bijective. + PrimExpr padding_predicate = Bool(false); + return {Downcast((*this)->inverse_index_map.value()), padding_predicate}; + } + // Dummy variables to represent the inverse's inputs. Array output_vars; for (size_t i = 0; i < (*this)->final_indices.size(); i++) { @@ -92,8 +100,15 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia // Unpack the map to an array, maintaining the same parameter order. 
Array inverse_exprs; - for (const auto& index : (*this)->initial_indices) { - inverse_exprs.push_back(analyzer.Simplify(inverse_exprs_map.at(index))); + for (int i = 0, n = (*this)->initial_indices.size(); i < n; ++i) { + Var index = (*this)->initial_indices[i]; + PrimExpr expr; + if (is_one(initial_ranges[i]->extent) && !inverse_exprs_map.count(index)) { + expr = initial_ranges[i]->min; + } else { + expr = inverse_exprs_map.at(index); + } + inverse_exprs.push_back(analyzer.Simplify(expr)); } PrimExpr padding_predicate = padded_iter_map->padding_predicate; @@ -117,57 +132,12 @@ std::pair IndexMap::NonSurjectiveInverse(Array initia } IndexMap IndexMap::Inverse(Array initial_ranges) const { - if ((*this)->inverse_index_map.defined()) { - // return the pre-defined inverse index map if exists. - return Downcast((*this)->inverse_index_map.value()); - } - // Dummy variables to represent the inverse's inputs. - Array output_vars; - for (size_t i = 0; i < (*this)->final_indices.size(); i++) { - PrimExpr index = (*this)->final_indices[i]; - // TODO(Lunderberg): Better names for these variables. A variable - // that is passed through unmodified (`index` is an element of - // `initial_indices`) should use that input index's name. A pair - // of output indices variables split from a single input index - // should be named (X.outer,X.inner). - std::stringstream ss; - ss << "axis" << i; - Var var_index(ss.str(), index.dtype()); - output_vars.push_back(var_index); - } - - // Dummy ranges for the extent of each input. - Map input_iters; - ICHECK_EQ((*this)->initial_indices.size(), initial_ranges.size()); - for (size_t i = 0; i < initial_ranges.size(); i++) { - input_iters.Set((*this)->initial_indices[i], initial_ranges[i]); - } - - // Unpack the output indices into linear combinations of the initial - // indices. 
+ auto [inverse, padding_predicate] = NonSurjectiveInverse(std::move(initial_ranges)); arith::Analyzer analyzer; - auto iter_map = DetectIterMap((*this)->final_indices, input_iters, /* predicate = */ 1, - /* check_level = */ arith::IterMapLevel::Bijective, &analyzer, - /* simplify_trivial_iterators = */ false); - CHECK(iter_map->indices.size()) << "Index transformation was not bijective."; - - // Determine expressions for the input variables, in terms of the - // output variables. - Map inverse_exprs_map = InverseAffineIterMap( - iter_map->indices, Array(output_vars.begin(), output_vars.end())); - - // Unpack the map to an array, maintaining the same parameter order. - Array inverse_exprs; - for (int i = 0, n = (*this)->initial_indices.size(); i < n; ++i) { - Var index = (*this)->initial_indices[i]; - if (is_one(initial_ranges[i]->extent) && !inverse_exprs_map.count(index)) { - inverse_exprs.push_back(initial_ranges[i]->min); - } else { - inverse_exprs.push_back(inverse_exprs_map.at(index)); - } - } - - return IndexMap(output_vars, inverse_exprs); + CHECK(analyzer.CanProve(!padding_predicate)) + << "Bijective inverse should not contain padding, but inverse of " << *this << " over range " + << initial_ranges << " resulted in a padding predicate of " << padding_predicate; + return inverse; } Array IndexMapNode::MapIndices(const Array& indices, From 5ddd35c37724bec8c4e89d911b31d4ecd6e41caa Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 26 Sep 2022 13:32:22 -1000 Subject: [PATCH 253/704] [Relay][TE] Add default param name if needed (#12912) #10516 used the Relay parameter name when lowering to TE. However, this creates an issue when the parameter name is empty. This is legal in Relay, but results in errors during code generation. 
For example, this is the generated CUDA kernel for bias add: ``` extern "C" __global__ void __launch_bounds__(1024) fused_raf_op_tvm_add_kernel0( float* __restrict__ T_add, float* __restrict__ , /* Name is missing and it results in compile errors. */ float* __restrict__ _1) { T_add[((((int)blockIdx.x) * 1024) + ((int)threadIdx.x))] = ([((((int)blockIdx.x) * 1024) + ((int)threadIdx.x))] + _1[((((((int)blockIdx.x) * 16) + (((int)threadIdx.x) >> 6)) % 54) / 9)]); } ``` This PR adds "placeholder" back as a default to make sure no empty string will be passed when lowering to TE. --- src/relay/backend/te_compiler_cache.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 17eac443ffe3..6f55402baded 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -131,8 +131,9 @@ class LowerToTECompute : public backend::MemoizedExprTranslatorparams) { Array inputs; for (const auto& ttype : FlattenTupleType(param->checked_type())) { - tvm::te::Tensor tensor = - tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, param->vid->name_hint); + auto name_hint = param->vid->name_hint; + tvm::te::Tensor tensor = tvm::te::placeholder( + GetShape(ttype->shape), ttype->dtype, (name_hint == "") ? 
"placeholder" : name_hint); inputs.push_back(tensor); fn_inputs_.push_back(tensor); } From 4d5ed073250aabf1dab50001aa4c85ec505062a7 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Mon, 26 Sep 2022 16:32:57 -0700 Subject: [PATCH 254/704] [TIR] Fix GetProducer/Consumer for duplicating dep edges (#12910) * [TIR] Fix GetProducer/Consumer for duplicating dep edges * preserve result ordering --- src/tir/schedule/primitive/get_block_loop.cc | 10 +++- .../unittest/test_tir_schedule_utilities.py | 52 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/tir/schedule/primitive/get_block_loop.cc b/src/tir/schedule/primitive/get_block_loop.cc index cbdb99c6444f..ecbadce470b9 100644 --- a/src/tir/schedule/primitive/get_block_loop.cc +++ b/src/tir/schedule/primitive/get_block_loop.cc @@ -81,10 +81,13 @@ Array GetProducers(const ScheduleState& self, const StmtSRef& block_sr StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); Array edges = self->GetBlockScope(scope_root)->GetDepsByDst(block_sref); Array results; + std::unordered_set result_set; results.reserve(edges.size()); for (const Dependency& edge : edges) { - if (edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) { + if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) && + !result_set.count(edge->src)) { results.push_back(edge->src); + result_set.emplace(edge->src); } } return results; @@ -94,10 +97,13 @@ Array GetConsumers(const ScheduleState& self, const StmtSRef& block_sr StmtSRef scope_root = GetScopeRoot(self, block_sref, /*require_stage_pipeline=*/false); Array edges = self->GetBlockScope(scope_root)->GetDepsBySrc(block_sref); Array results; + std::unordered_set result_set; results.reserve(edges.size()); for (const Dependency& edge : edges) { - if (edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) { + if ((edge->kind == DepKind::kRAW || edge->kind == DepKind::kWAW) && + !result_set.count(edge->dst)) { 
results.push_back(edge->dst); + result_set.emplace(edge->dst); } } return results; diff --git a/tests/python/unittest/test_tir_schedule_utilities.py b/tests/python/unittest/test_tir_schedule_utilities.py index 41844a868e6b..33ef0e221563 100644 --- a/tests/python/unittest/test_tir_schedule_utilities.py +++ b/tests/python/unittest/test_tir_schedule_utilities.py @@ -124,6 +124,38 @@ def vector_add_2( B[vi] = A[vi] +@T.prim_func +def tuple_reduction(data: T.Buffer[(4, 32), "float32"], T_add: T.Buffer[(4,), "float32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + with T.block("root"): + T.reads() + T.writes() + data_red_temp_v0 = T.alloc_buffer([4], dtype="float32") + data_red_temp_v1 = T.alloc_buffer([4], dtype="float32") + for i0, i1 in T.grid(4, 32): + with T.block("data_red_temp"): + ax0, k1 = T.axis.remap("SR", [i0, i1]) + T.reads(data[ax0, k1]) + T.writes(data_red_temp_v0[ax0], data_red_temp_v1[ax0]) + with T.init(): + data_red_temp_v0[ax0] = T.float32(0) + data_red_temp_v1[ax0] = T.float32(0) + v_data_red_temp_v0: T.float32 = data_red_temp_v0[ax0] + data[ax0, k1] + v_data_red_temp_v1: T.float32 = ( + data_red_temp_v1[ax0] + data[ax0, k1] * data[ax0, k1] + ) + data_red_temp_v0[ax0] = v_data_red_temp_v0 + data_red_temp_v1[ax0] = v_data_red_temp_v1 + for i0 in range(4): + with T.block("T_add"): + (ax0,) = T.axis.remap("S", [i0]) + T.reads(data_red_temp_v0[ax0], data_red_temp_v1[ax0]) + T.writes(T_add[ax0]) + T_add[ax0] = data_red_temp_v0[ax0] + data_red_temp_v1[ax0] + + # pylint: enable=no-member,invalid-name,unused-variable use_block_name = tvm.testing.parameter(by_dict={"block_obj": False, "block_name": True}) @@ -261,6 +293,16 @@ def test_get_producers(use_block_name): verify_trace_roundtrip(sch, mod=matmul_relu) +def test_get_producers_multiple_buffer_depdencies(use_block_name): + sch = tir.Schedule(mod=tuple_reduction, debug_mask="all") + block = "T_add" if use_block_name else sch.get_block("T_add") + 
(producer,) = sch.get_producers(block) + assert tvm.ir.structural_equal( + sch.get_sref(producer).stmt, + sch.get_sref(sch.get_block("data_red_temp")).stmt, + ) + + def test_get_consumers(use_block_name): sch = tir.Schedule(mod=matmul_relu, debug_mask="all") block = "matmul" if use_block_name else sch.get_block("matmul") @@ -272,6 +314,16 @@ def test_get_consumers(use_block_name): verify_trace_roundtrip(sch, mod=matmul_relu) +def test_get_consumers_multiple_buffer_depdencies(use_block_name): + sch = tir.Schedule(mod=tuple_reduction, debug_mask="all") + block = "data_red_temp" if use_block_name else sch.get_block("data_red_temp") + (consumer,) = sch.get_consumers(block) + assert tvm.ir.structural_equal( + sch.get_sref(consumer).stmt, + sch.get_sref(sch.get_block("T_add")).stmt, + ) + + def test_annotate_unannotate_loop(): sch = tir.Schedule(mod=matmul_relu, debug_mask="all") matmul = sch.get_block("matmul") From f64e933246ba7837f691979b5d78c0449297d4b2 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 27 Sep 2022 01:02:05 -0500 Subject: [PATCH 255/704] [LLVM] Emit fp16/fp32 builtins directly into target module (#12877) For conversions between `_Float16` and `float`, LLVM uses runtime functions `__extendhfsf2` and `__truncsfhf2`. On X86 up until version 14, LLVM used `uint16_t` for representing `_Float16`. Starting with LLVM 15, half- precision values can be passed in XMM registers (i.e. as floating-point). This happens when the compilation target has SSE2 enabled (either directly, or by enabling a feature that implies SSE2). Because the names of the conversion functions remain unchanged, it is impossible for TVM to provide them in the runtime, and have them work in both cases. To solve this issue, emit these functions directly into the target module after detecting whether or not to use floating-point ABI. To allow the linker to remove potential duplicates (or if they are unused), they are weak and reside in a separate section. 
--- src/runtime/builtin_fp16.cc | 3 - src/target/llvm/codegen_llvm.cc | 227 ++++++++++++++++++ src/target/llvm/codegen_llvm.h | 8 + .../unittest/test_target_codegen_llvm.py | 7 +- .../unittest/test_target_codegen_x86.py | 74 ++++-- 5 files changed, 298 insertions(+), 21 deletions(-) diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc index 4b175fb3ff60..d229491a4c7b 100644 --- a/src/runtime/builtin_fp16.cc +++ b/src/runtime/builtin_fp16.cc @@ -48,7 +48,4 @@ TVM_DLL float __gnu_h2f_ieee(uint16_t a) { } #endif - -TVM_DLL uint16_t __truncsfhf2(float v) { return __gnu_f2h_ieee(v); } -TVM_DLL float __extendhfsf2(uint16_t v) { return __gnu_h2f_ieee(v); } } diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 305358d079d0..ca9d577f64f6 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -34,6 +34,11 @@ #else #include #endif +#if TVM_LLVM_VERSION >= 60 +#include +#else +#include +#endif #include #include #include @@ -167,6 +172,45 @@ void CodeGenLLVM::InitTarget() { LOG(WARNING) << "Set native vector bits to be 128 for " << arch_name; } } + +#if TVM_LLVM_VERSION >= 60 + bool use_float16_abi = false; +#if TVM_LLVM_VERSION >= 150 + // For conversions between _Float16 and float, LLVM uses runtime functions + // __extendhfsf2 and __truncsfhf2. On X86 up until version 14, LLVM used + // "uint16_t" for representing _Float16. Starting with LLVM 15, half-precision + // values can be passed in XMM registers (i.e. as floating-point). This happens + // when the compilation target has SSE2 enabled (either directly, or by enabling + // a feature that implies SSE2). + // Because the names of the conversion functions remain unchanged, it is impossible + // for TVM to provide them in the runtime, and have them work in both cases. + // To alleviate this issue, emit these functions directly into the target module + // after detecting whether or not to use floating-point ABI. 
To allow the linker + // to remove potential duplicates (or if they are unused), they are weak and + // reside in a separate section (ELF). + llvm::Triple::ArchType arch_type = tm->getTargetTriple().getArch(); + if (arch_type == llvm::Triple::x86 || arch_type == llvm::Triple::x86_64) { + // Detect if SSE2 is enabled. This determines whether float16 ABI is used. + std::stringstream os; + const char fname[] = "test_sse2"; + os << "target triple = \"" << llvm_target_->GetTargetTriple() << "\"\n" + << "define void @" << fname << "() #0 { ret void } attributes #0 = { \"target-cpu\"=\"" + << llvm_target_->GetCPU() << "\" "; + if (auto&& fs = llvm_target_->GetTargetFeatureString(); !fs.empty()) { + os << "\"target-features\"=\"" << fs << "\" "; + } + os << "}\n"; + auto mod = llvm_target_->GetInstance().ParseIR(os.str()); + auto* test_sse2 = mod->getFunction(fname); + ICHECK_NE(test_sse2, nullptr) << "Module creation error"; + use_float16_abi = tm->getSubtargetImpl(*test_sse2)->checkFeatures("+sse2"); + } +#endif // TVM_LLVM_VERSION >= 150 + + // Call this function only with LLVM >= 6.0. The code it emits uses "dso_local" + // which was introduced in LLVM 6. 
+ EmitFloat16ConversionBuiltins(use_float16_abi); +#endif // TVM_LLVM_VERSION >= 60 } void CodeGenLLVM::AddFunction(const PrimFunc& f) { this->AddFunctionInternal(f, false); } @@ -949,6 +993,189 @@ void CodeGenLLVM::SetTargetAttributes(llvm::Function* func) { } } +void CodeGenLLVM::EmitFloat16ConversionBuiltins(bool use_float16_abi) { + // The LLVM IR for these functions was obtained by compiling + // + // For integer ABI: + // __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a); + // __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a); + // For floating-point ABI: + // __truncXfYf2__<float, uint32_t, 23, _Float16, uint16_t, 10>(a); + // __extendXfYf2__<_Float16, uint16_t, 10, float, uint32_t, 23>(a); + + static const char trunc_body[] = // __truncsfhf2 + " %v0 = bitcast float %a0 to i32\n" + " %v1 = and i32 %v0, 2147483647\n" + " %v2 = add nsw i32 %v1, -947912704\n" + " %v3 = add nsw i32 %v1, -1199570944\n" + " %v4 = icmp ult i32 %v2, %v3\n" + " br i1 %v4, label %b1, label %b5\n" + "b1:\n" + " %v5 = lshr i32 %v0, 13\n" + " %v6 = and i32 %v5, 65535\n" + " %v7 = add nuw nsw i32 %v6, -114688\n" + " %v8 = and i32 %v0, 8191\n" + " %v9 = icmp ugt i32 %v8, 4096\n" + " br i1 %v9, label %b2, label %b3\n" + "b2:\n" + " %v10 = add nuw nsw i32 %v6, -114687\n" + " br label %b13\n" + "b3:\n" + " %v11 = icmp eq i32 %v8, 4096\n" + " br i1 %v11, label %b4, label %b13\n" + "b4:\n" + " %v12 = and i32 %v7, 65535\n" + " %v13 = and i32 %v5, 1\n" + " %v14 = add nuw nsw i32 %v12, %v13\n" + " br label %b13\n" + "b5:\n" + " %v15 = icmp ugt i32 %v1, 2139095040\n" + " br i1 %v15, label %b6, label %b7\n" + "b6:\n" + " %v16 = lshr i32 %v0, 13\n" + " %v17 = and i32 %v16, 511\n" + " %v18 = or i32 %v17, 32256\n" + " br label %b13\n" + "b7:\n" + " %v19 = icmp ugt i32 %v1, 1199570943\n" + " br i1 %v19, label %b13, label %b8\n" + "b8:\n" + " %v20 = icmp ult i32 %v1, 754974720\n" + " br i1 %v20, label %b13, label %b9\n" + "b9:\n" + " %v21 = lshr i32 %v1, 23\n" + " %v22 = sub nsw i32 113, %v21\n" + " %v23 = and i32 %v0, 8388607\n" + " %v24 = or i32 %v23, 8388608\n" + " %v25 = add nsw i32 %v21, -81\n" + " %v26 = 
shl i32 %v24, %v25\n" + " %v27 = icmp ne i32 %v26, 0\n" + " %v28 = lshr i32 %v24, %v22\n" + " %v29 = zext i1 %v27 to i32\n" + " %v30 = lshr i32 %v28, 13\n" + " %v31 = and i32 %v28, 8191\n" + " %v32 = or i32 %v31, %v29\n" + " %v33 = icmp ugt i32 %v32, 4096\n" + " br i1 %v33, label %b10, label %b11\n" + "b10:\n" + " %v34 = add nuw nsw i32 %v30, 1\n" + " br label %b13\n" + "b11:\n" + " %v35 = icmp eq i32 %v32, 4096\n" + " br i1 %v35, label %b12, label %b13\n" + "b12:\n" + " %v36 = and i32 %v30, 1\n" + " %v37 = add nuw nsw i32 %v36, %v30\n" + " br label %b13\n" + "b13:\n" + " %v38 = phi i32 [ %v18, %b6 ], [ %v10, %b2 ], [ %v14, %b4 ], [ %v7, %b3 ],\n" + " [ 31744, %b7 ], [ 0, %b8 ], [ %v34, %b10 ], [ %v37, %b12 ],\n" + " [ %v30, %b11 ]\n" + " %v39 = lshr i32 %v0, 16\n" + " %v40 = and i32 %v39, 32768\n" + " %v41 = or i32 %v38, %v40\n" + " %vlast = trunc i32 %v41 to i16\n"; + + static const char extend_body[] = // __extendhfsf2 + " %v1 = and i16 %vinp, 32767\n" + " %v2 = zext i16 %v1 to i32\n" + " %v3 = add nsw i16 %v1, -1024\n" + " %v4 = icmp ult i16 %v3, 30720\n" + " br i1 %v4, label %b1, label %b2\n" + "b1:\n" + " %v5 = shl nuw nsw i32 %v2, 13\n" + " %v6 = add nuw nsw i32 %v5, 939524096\n" + " br label %b6\n" + "b2:\n" + " %v7 = icmp ugt i16 %v1, 31743\n" + " br i1 %v7, label %b3, label %b4\n" + "b3:\n" + " %v8 = shl nuw nsw i32 %v2, 13\n" + " %v9 = or i32 %v8, 2139095040\n" + " br label %b6\n" + "b4:\n" + " %v10 = icmp eq i16 %v1, 0\n" + " br i1 %v10, label %b6, label %b5\n" + "b5:\n" + " %v11 = icmp ult i16 %v1, 256\n" + " %v12 = lshr i32 %v2, 8\n" + " %v13 = select i1 %v11, i32 %v2, i32 %v12\n" + " %v14 = select i1 %v11, i32 32, i32 24\n" + " %v15 = icmp ult i32 %v13, 16\n" + " %v16 = lshr i32 %v13, 4\n" + " %v17 = add nsw i32 %v14, -4\n" + " %v18 = select i1 %v15, i32 %v13, i32 %v16\n" + " %v19 = select i1 %v15, i32 %v14, i32 %v17\n" + " %v20 = icmp ult i32 %v18, 4\n" + " %v21 = lshr i32 %v18, 2\n" + " %v22 = add nsw i32 %v19, -2\n" + " %v23 = select i1 %v20, i32 
%v18, i32 %v21\n" + " %v24 = select i1 %v20, i32 %v19, i32 %v22\n" + " %v25 = icmp ult i32 %v23, 2\n" + " %v26 = sub nsw i32 0, %v23\n" + " %v27 = select i1 %v25, i32 %v26, i32 -2\n" + " %v28 = add nsw i32 %v27, %v24\n" + " %v29 = add nsw i32 %v28, -8\n" + " %v30 = shl i32 %v2, %v29\n" + " %v31 = xor i32 %v30, 8388608\n" + " %v32 = shl i32 %v28, 23\n" + " %v33 = sub i32 1124073472, %v32\n" + " %v34 = or i32 %v31, %v33\n" + " br label %b6\n" + "b6:\n" + " %v35 = phi i32 [ %v6, %b1 ], [ %v9, %b3 ], [ %v34, %b5 ], [ 0, %b4 ]\n" + " %v36 = and i16 %vinp, -32768\n" + " %v37 = zext i16 %v36 to i32\n" + " %v38 = shl nuw i32 %v37, 16\n" + " %v39 = or i32 %v35, %v38\n" + " %v40 = bitcast i32 %v39 to float\n" + " ret float %v40\n" + "}\n"; + + std::string short_type = use_float16_abi ? "half" : "i16"; + + std::string short_cast_in, short_cast_out; + if (use_float16_abi) { + short_cast_in = " %vinp = bitcast half %a0 to i16\n"; + short_cast_out = " %vres = bitcast i16 %vlast to half\n"; + } else { + // No-ops that preserve the i16 values. + short_cast_in = " %vinp = add i16 %a0, 0\n"; + short_cast_out = " %vres = add i16 %vlast, 0\n"; + } + + llvm::Triple triple(llvm_target_->GetTargetTriple()); + + static const char elf_section_name[] = ".text.tvm.fp16.conv"; + std::string section = triple.getObjectFormat() == llvm::Triple::ELF + ? 
std::string("section \"") + elf_section_name + "\" " + : ""; + + std::string trunc_header = "define weak dso_local " + short_type + + " @__truncsfhf2(float %a0) local_unnamed_addr #0 " + section + + "{\nb0:\n"; + std::string trunc_return = " ret " + short_type + " %vres\n}\n"; + + std::string extend_header = "define weak dso_local float @__extendhfsf2(" + short_type + + " %a0) local_unnamed_addr #0 " + section + "{\nb0:\n"; + + // truncate = trunc_header + trunc_body + short_cast_out + trunc_return + // extend = extend_header + short_cast_in + extend_body + + std::string attributes = "attributes #0 = { nounwind readnone \"target-cpu\"=\"" + + llvm_target_->GetCPU() + "\" \"target-features\"=\"" + + llvm_target_->GetTargetFeatureString() + "\" }\n"; + + auto data_layout = llvm_target_->GetOrCreateTargetMachine()->createDataLayout(); + std::string module_ir = "target triple = \"" + llvm_target_->GetTargetTriple() + "\"\n" + + "target datalayout = \"" + data_layout.getStringRepresentation() + + "\"\n" + trunc_header + trunc_body + short_cast_out + trunc_return + + extend_header + short_cast_in + extend_body + attributes; + + auto builtins_module = llvm_target_->GetInstance().ParseIR(module_ir); + link_modules_.push_back(std::move(builtins_module)); +} + llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { if (op->op.same_as(builtin_call_llvm_intrin_) || op->op.same_as(builtin_call_llvm_pure_intrin_)) { ICHECK_GE(op->args.size(), 2U); diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index e6321be647aa..7a8daf2e761f 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -395,6 +395,14 @@ class CodeGenLLVM : public ExprFunctor, * \param func The function to set attributes on. */ void SetTargetAttributes(llvm::Function* func); + /*! + * \brief Emit LLVM IR for conversion functions __extendhfsf2 and __truncsfhf2 + * into the current llvm::Module. 
+ * + * \param use_float16_abi Whether to use floating-point or integer ABI. + */ + void EmitFloat16ConversionBuiltins(bool use_float16_abi); + /*! * \brief Get the number of elements in the given vector value. * \param vec The value, must be of a vector type. diff --git a/tests/python/unittest/test_target_codegen_llvm.py b/tests/python/unittest/test_target_codegen_llvm.py index c57648382827..e179d17101a3 100644 --- a/tests/python/unittest/test_target_codegen_llvm.py +++ b/tests/python/unittest/test_target_codegen_llvm.py @@ -18,11 +18,11 @@ import ctypes import json import math +import numpy as np +import pytest import re import sys -import numpy as np -import pytest import tvm import tvm.testing from tvm import te @@ -854,7 +854,8 @@ def make_call_extern(caller, callee): } mod = tvm.IRModule(functions=functions) ir_text = tvm.build(mod, None, target="llvm").get_source("ll") - matches = re.findall(r"^define[^@]*@([a-zA-Z_][a-zA-Z0-9_]*)", ir_text, re.MULTILINE) + # Skip functions whose names start with _. + matches = re.findall(r"^define[^@]*@([a-zA-Z][a-zA-Z0-9_]*)", ir_text, re.MULTILINE) assert matches == sorted(matches) diff --git a/tests/python/unittest/test_target_codegen_x86.py b/tests/python/unittest/test_target_codegen_x86.py index ec42e0a4d749..af91ed4520fd 100644 --- a/tests/python/unittest/test_target_codegen_x86.py +++ b/tests/python/unittest/test_target_codegen_x86.py @@ -14,27 +14,25 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import tvm -from tvm import te +import numpy as np +import platform +import pytest import re +import textwrap +import tvm +from tvm import te -def test_fp16_to_fp32(): - if tvm.target.codegen.llvm_version_major() < 6: - print( - "Skipping due to LLVM version being {} < 6".format( - tvm.target.codegen.llvm_version_major() - ) - ) - return +llvm_version = tvm.target.codegen.llvm_version_major() +machine = platform.machine() - import platform +if machine not in ["i386", "x86_64", "AMD64", "amd64"]: + pytest.skip(f"Requires x86_64/i386, but machine is {machine}", allow_module_level=True) - machine = platform.machine() - if machine not in ["x86_64", "i386", "AMD64"]: - print("Skipping test because the platform is: {} ".format(machine)) - return +@tvm.testing.requires_llvm +@pytest.mark.skipif(llvm_version < 6, reason=f"Requires LLVM 6+, got {llvm_version}") +def test_fp16_to_fp32(): def fp16_to_fp32(target, width, match=None, not_match=None): elements = 64 n = tvm.runtime.convert(elements) @@ -63,5 +61,51 @@ def fp16_to_fp32(target, width, match=None, not_match=None): fp16_to_fp32("llvm", 9, not_match="vcvtph2ps") +is_32bit = platform.architecture()[0] == "32bit" + + +@tvm.testing.requires_llvm +@pytest.mark.skipif(is_32bit, reason=f"Fails in CI due to architecture mismatch in JIT") +@pytest.mark.parametrize("feature_string", ["-sse2", "+sse2"]) +def test_fp16_fp32_conversions(feature_string): + relay_model = textwrap.dedent( + """ + #[version = "0.0.5"] + def @main(%inp : Tensor[(3), float32], %cst : Tensor[(3), float32]) { + %1 = cast(%inp, dtype="float16"); + %2 = cast(%cst, dtype="float16"); + %3 = add(%1, %2); + %4 = cast(%3, dtype="float32"); + %4 + } + """ + ) + + ir_mod = tvm.parser.fromtext(relay_model) + + arch = "i386" if machine == "i386" else "x86_64" + aot_factory = tvm.relay.build( + ir_mod, + params={"cst": np.array([1.0, 2.0, 3.0], dtype="float32")}, + target=f"llvm --mtriple={arch} --mattr={feature_string}", + executor=tvm.relay.backend.Executor( + 
"aot", {"interface-api": "packed", "unpacked-api": False} + ), + ) + + mod_name = aot_factory["list_module_names"]()[0] + executor = aot_factory[mod_name] + mod = executor(tvm.cpu(0)) + + inp = tvm.nd.array(np.array([1.1, 2.1, 3.1], dtype="float32"), device=tvm.cpu(0)) + + mod.get_function("set_input")(0, inp) + mod.get_function("run")() + out = mod.get_function("get_output")(0) + + expected = np.array([2.1, 4.1, 6.1], dtype="float32") + np.testing.assert_allclose(out.asnumpy(), expected, rtol=1e-3) + + if __name__ == "__main__": test_fp16_to_fp32() From b61f633e10b02ac3e767ad268562a4dd2c178de5 Mon Sep 17 00:00:00 2001 From: Yaoda Zhou Date: Tue, 27 Sep 2022 14:20:59 +0800 Subject: [PATCH 256/704] [TVM PyTorch Integration] optimized_torch & as_torch how-to guide (#12318) * how-to use optmized_torch * as_torch * format * one more comment * improve doc * improve code * fix text * SSR * CPU model * whitespace * improve document * small edit * retrigger ci * using_as_torch polish * using_optimized_torch * fix errors * one more author * small edit * polish as_torch * save progress * more edit * small edit Co-authored-by: juda --- .../work_with_pytorch/using_as_torch.py | 159 ++++++++++++++++++ .../using_optimized_torch.py | 149 ++++++++++++++++ python/tvm/contrib/torch/as_torch.py | 9 +- python/tvm/contrib/torch/optimize_torch.py | 4 +- 4 files changed, 316 insertions(+), 5 deletions(-) create mode 100644 gallery/how_to/work_with_pytorch/using_as_torch.py create mode 100644 gallery/how_to/work_with_pytorch/using_optimized_torch.py diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py new file mode 100644 index 000000000000..e17a29e277ea --- /dev/null +++ b/gallery/how_to/work_with_pytorch/using_as_torch.py @@ -0,0 +1,159 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Wrap Your TVMScript as PyTorch Module +===================================== +**Author**: +`Yaoda Zhou `_ + +This article is a tutorial on wrapping TVMScript code as a PyTorch module. +Using the decorator `as_torch`, users can wrap TVMScript code into a PyTorch nn.Module naturally. +""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + +# Import PyTorch, as well as necessary libraries +import torch +import torch.nn.functional as F +import torch.utils.benchmark as benchmark + +import tvm +from tvm.contrib.torch import as_torch +from tvm.script import tir as T + +###################################################################### +# Write your own PyTorch operator by TVMScript +# -------------------------------------------- +# PyTorch is a very popular machine learning framework which contains +# optimized implementations of most commonly used operators. +# Nevertheless, sometimes you might want to write your own operators in PyTorch. +# In that case, the performance of such custom operators might not be satisfactory for your needs. +# +# For example, suppose that we are going to define a 1-d depthwise convolution operator.
+# Assume the number of in_channel and out_channel are both 70, +# the width is 80 and the kernel size is 20, +# then the 1-d depthwise conv could be written in PyTorch in one line: + +in_channel = 70 +out_channel = 70 +width = 80 +kernel_size = 20 + + +def torch_depthwise(inputs, filters): + return F.conv1d(inputs, filters.view(out_channel, 1, kernel_size), groups=out_channel) + + +# We can run this function as: + +inputs = torch.randn(in_channel, width) +filters = torch.randn(out_channel, kernel_size) +ret_torch = torch_depthwise(inputs, filters) + + +# The `torch_depthwise` function, in a plain Python code, could be written as: + + +def vanilla_depthwise(input, weight): + ret = torch.zeros(out_channel, width - kernel_size + 1) + for j in range(out_channel): + for i in range(width - kernel_size + 1): + for k in range(kernel_size): + ret[j, i] += weight[j, k] * input[j, i + k] + return ret + + +# Then, we plan to optimize the `depthwise` function by leveraging the power of TVM. +# TVM community proposes an embedded Domain Specific Language in Python called TVMScript, +# which serves as the high-level frontend for TVM's Tensor IR. +# The depthwise 1D convolution code above can be translated to TVMScript as follows. +# We provide an `as_torch` decorator, which converts the TVMScript code to PyTorch's nn.Module automatically. + + +@as_torch +@T.prim_func +def tvm_depthwise( + A: T.Buffer((70, 80), "float32"), + B: T.Buffer((70, 20), "float32"), + C: T.Buffer((70, 61), "float32"), +) -> None: + for j, i, k in T.grid(70, 61, 20): + with T.block(): + vi, vj, vk = T.axis.remap("SSR", [i, j, k]) + with T.init(): + C[vj, vi] = T.float32(0) + C[vj, vi] += B[vj, vk] * A[vj, vi + vk] + + +# We can build the TVMScript code by calling the `tune` method in default setting. +# Without providing extra information, the model will be tuned for CPU. 
+ +tvm_depthwise.tune() + +# We can print out the tuned TVMScript code to see how the program is transformed, as + +print(tvm_depthwise.script()) + +# We can verify that the two outputs are the same: + +ret_tvm = torch.zeros(out_channel, width - kernel_size + 1) +tvm_depthwise(inputs, filters, ret_tvm) + +testing.assert_allclose(ret_torch.cpu().numpy(), ret_tvm.cpu().numpy(), atol=1e-5, rtol=1e-5) + + +###################################################################### +# Benchmark +# ------------------------------- + +results = [] +for i in range(5): + inputs = torch.randn(out_channel, width) + filters = torch.randn(out_channel, kernel_size) + res = torch.zeros(out_channel, width - kernel_size + 1) + sub_label = f"[test {i}]" + results.append( + benchmark.Timer( + stmt="tvm_depthwise(inputs, filters, res)", + setup="from __main__ import tvm_depthwise", + globals={"inputs": inputs, "filters": filters, "res": res}, + sub_label=sub_label, + description="TVMScript", + ).blocked_autorange() + ) + results.append( + benchmark.Timer( + stmt="torch_depthwise(inputs, filters)", + setup="from __main__ import torch_depthwise", + globals={ + "inputs": inputs, + "filters": filters, + }, + sub_label=sub_label, + description="PyTorch", + ).blocked_autorange() + ) +compare = benchmark.Compare(results) +compare.print() + +# In author's environment, the average inference time of `tvm_depthwise` is 120.0 us, +# while the average inference time of `torch_depthwise` is 196.0 us (PyTorch version is 1.11.0), +# showing the speedup of around 38%. diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py new file mode 100644 index 000000000000..aa68d9e68ec6 --- /dev/null +++ b/gallery/how_to/work_with_pytorch/using_optimized_torch.py @@ -0,0 +1,149 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Compile PyTorch Models +====================== +**Author**: +`Yaoda Zhou `_ + +This article is a tutorial to optimize PyTorch models by using decorator `optimize_torch`. +To follow this tutorial, PyTorch, as well as TorchVision, should be installed. +""" + +# sphinx_gallery_start_ignore +from tvm import testing + +testing.utils.install_request_hook(depth=3) +# sphinx_gallery_end_ignore + +# Import PyTorch +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Import library for profiling +import torch.utils.benchmark as benchmark +from torchvision.models import resnet18 + +# Import `optimize_torch` function +from tvm.contrib.torch import optimize_torch +from tvm.meta_schedule import TuneConfig + +###################################################################### +# Define a simple module written by PyTorch +# ------------------------------ + + +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 20, 5) + self.conv2 = nn.Conv2d(20, 20, 5) + + def forward(self, x): + x = F.relu(self.conv1(x)) + return F.relu(self.conv2(x)) + + +###################################################################### +# Optimize SimpleModel by TVM MetaSchedule +# ------------------------------ +# We provide the `optimize_torch` 
function, which is used similarly to `torch.jit.trace`. +# The PyTorch model to optimize, along with its example input, is provided by the user. +# The PyTorch module will be tuned by TVM for the target hardware. +# Without providing extra information, the model will be tuned for CPU. + +simple_model = SimpleModel() +example_input = torch.randn(20, 1, 10, 10) +model_optimized_by_tvm = optimize_torch(simple_model, example_input) + +###################################################################### +# Save/Load module +# ------------------------------ +# We can save and load our tuned module like the standard `nn.Module`. + +# Let us run our tuned module. +ret1 = model_optimized_by_tvm(example_input) + +torch.save(model_optimized_by_tvm, "model_optimized.pt") +model_loaded = torch.load("model_optimized.pt") + +# We load the module and run it again. +ret2 = model_loaded(example_input) + +# We will show 2 results: +# (1) the model can be safely saved and loaded: the result of the model +# after the save and load operations is still the same as the original one; +# (2) the model we optimize returns the same result as the original PyTorch model. + +ret3 = simple_model(example_input) +testing.assert_allclose(ret1.detach().numpy(), ret2.detach().numpy(), atol=1e-5, rtol=1e-5) +testing.assert_allclose(ret1.detach().numpy(), ret3.detach().numpy(), atol=1e-5, rtol=1e-5) + +###################################################################### +# Optimize resnet18 +# ------------------------------ +# In the following, we will show that our approach is able to +# accelerate common models, such as resnet18. + +# We will tune our model for the GPU. +target_cuda = "nvidia/geforce-rtx-3070" + +# For PyTorch users, the code could be written as usual, except for +# applying "optimize_torch" function on the resnet18 model.
+ +resnet18_tvm = optimize_torch( + resnet18().cuda().eval(), [torch.rand(1, 3, 224, 224).cuda()], target=target_cuda +) + +# TorchScript also provides a built-in "optimize_for_inference" function to accelerate the inference. +resnet18_torch = torch.jit.optimize_for_inference(torch.jit.script(resnet18().cuda().eval())) + + +###################################################################### +# Compare the performance between two approaches. +# ------------------------------ + +results = [] +for i in range(5): + test_input = torch.rand(1, 3, 224, 224).cuda() + sub_label = f"[test {i}]" + results.append( + benchmark.Timer( + stmt="resnet18_tvm(test_input)", + setup="from __main__ import resnet18_tvm", + globals={"test_input": test_input}, + sub_label=sub_label, + description="tuning by meta", + ).blocked_autorange() + ) + results.append( + benchmark.Timer( + stmt="resnet18_torch(test_input)", + setup="from __main__ import resnet18_torch", + globals={"test_input": test_input}, + sub_label=sub_label, + description="tuning by jit", + ).blocked_autorange() + ) + +compare = benchmark.Compare(results) +compare.print() + +# In author's environment, the average inference time of `resnet18_tvm` is 620.0 us, +# while the average inference time of `resnet18_torch` is 980.0 us (PyTorch version is 1.11.0), +# showing the speedup of around 38%. diff --git a/python/tvm/contrib/torch/as_torch.py b/python/tvm/contrib/torch/as_torch.py index 3a2b4dda9ea9..a8cd895a6c5e 100644 --- a/python/tvm/contrib/torch/as_torch.py +++ b/python/tvm/contrib/torch/as_torch.py @@ -21,7 +21,7 @@ # pylint: disable=missing-class-docstring # pylint: disable=missing-function-docstring """ -as_torch: a decorator, which is used to wrap the TVMscript code to `torch.nn.module`. +as_torch: a decorator, which is used to wrap the TVMScript code to `torch.nn.module`. 
""" import tempfile from typing import Callable, List, Union @@ -50,7 +50,7 @@ def __init__( def tune(self, config: TuneConfig = None, target: Union[str, Target] = None): """ - Tune the TVMscript code. + Tune the TVMScript code. Parameters ---------- @@ -80,6 +80,9 @@ def tune(self, config: TuneConfig = None, target: Union[str, Target] = None): self.ir_module = sch.mod self.build(target) + def script(self): + return self.ir_module.script() + def build(self, target=None): runtime_module = tvm.build(self.ir_module, target=target) func = tvm.get_global_func("tvmtorch.save_runtime_mod") @@ -105,7 +108,7 @@ def as_torch(func: Union[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Call Parameters ---------- func: Optional[tvm.ir.module.IRModule, tvm.tir.function.PrimFunc, Callable] - The function written by TVMscript. + The function written by TVMScript. Returns ------- diff --git a/python/tvm/contrib/torch/optimize_torch.py b/python/tvm/contrib/torch/optimize_torch.py index 282e6c5dc84f..821a3b1f71d5 100644 --- a/python/tvm/contrib/torch/optimize_torch.py +++ b/python/tvm/contrib/torch/optimize_torch.py @@ -40,7 +40,6 @@ from tvm.ir.module import IRModule from tvm.ir.transform import PassContext from tvm.meta_schedule import TuneConfig, default_config -from tvm.meta_schedule.apply_history_best import ApplyHistoryBest from tvm.meta_schedule.relay_integration import extract_task_from_relay from tvm.meta_schedule.tune import tune_extracted_tasks from tvm.meta_schedule.utils import autotvm_silencer @@ -114,12 +113,13 @@ def tune_relay_auto( ) database = tune_extracted_tasks(extracted_tasks, config, work_dir) relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend] - with target, autotvm_silencer(), ApplyHistoryBest(database): + with target, autotvm_silencer(), database: with PassContext( opt_level=3, config={ "relay.backend.use_meta_schedule": True, "relay.backend.use_meta_schedule_dispatch": target.kind.name != "cuda", + "relay.backend.tir_converter": 
"default", }, ): return relay_build(mod, target=target, params=params) From 7a4c10c44a9255ac2fa52ce7e3a83f718d60823f Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 27 Sep 2022 03:56:42 -0500 Subject: [PATCH 257/704] [TIR][Transform] Remove num_unpacked_args from MakePackedAPI (#12892) Other than a single unit test, there are no usages of this parameter in TVM, and it significantly complicates the logic of `MakePackedAPI`. Similar functionality can be had by using `MakeUnpackedAPI` instead. --- include/tvm/tir/transform.h | 9 +- python/tvm/tir/transform/transform.py | 11 +- src/driver/driver_api.cc | 2 +- src/tir/transforms/make_packed_api.cc | 109 ++++++------------ .../test_tir_transform_make_packed_api.py | 5 +- 5 files changed, 46 insertions(+), 90 deletions(-) diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index a4caeee43604..6aa1aca69970 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -192,16 +192,13 @@ TVM_DLL Pass InstrumentBoundCheckers(); * - Map the values in the api_args to Var that is required by body. * - Insert assertions to check type/value of the passed arguments. * - * \param num_unpacked_args Number of arguments that - * are processed in plain form instead of packed form. - * * \note * The function signature have two cases * - * let num_packed_args = len(api_args) - num_unpacked_args; + * let num_packed_args = len(api_args); * * if num_packed_args is zero: - * f(api_arg_0, api_arg_1, .., api_arg_n) where n == len(api_args) + * f() * * if num_packed_args is not zero: * f(TVMArg* packed_args, int* packed_arg_type_ids, int num_packed_args, @@ -212,7 +209,7 @@ TVM_DLL Pass InstrumentBoundCheckers(); * * \return The pass. */ -TVM_DLL Pass MakePackedAPI(int num_unpacked_args); +TVM_DLL Pass MakePackedAPI(); /*! 
* \brief Transform the high-level PrimFunc to a C signature that can be used diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 324471c71891..3c1ca196f1b0 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -387,22 +387,15 @@ def LowerCustomDatatypes(): return _ffi_api.LowerCustomDatatypes() # type: ignore -def MakePackedAPI(num_unpacked_params: int = -1): +def MakePackedAPI(): """Transform the PrimFuncs in the module to a packed func API. - Parameters - ---------- - num_unpacked_params : int - Number of parameters that we hope to directly pass via normal arguments - following the PackedFunc input signature. If it is specified as -1 or it - is less than the number of arguments, the pass will packed arguments still. - Returns ------- fpass : tvm.transform.Pass The result pass """ - return _ffi_api.MakePackedAPI(num_unpacked_params) # type: ignore + return _ffi_api.MakePackedAPI() # type: ignore def MakeUnpackedAPI(): diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 1a617dcd494d..b460557da034 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -561,7 +561,7 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) if (unpacked_api) { mixed_pass_list.push_back(tir::transform::MakeUnpackedAPI()); } else { - mixed_pass_list.push_back(tir::transform::MakePackedAPI(-1)); + mixed_pass_list.push_back(tir::transform::MakePackedAPI()); } mixed_pass_list.push_back(tir::transform::SplitHostDevice()); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 4f8ad1223cd2..bf7ff09c86c7 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -139,7 +139,7 @@ inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) { return AssertStmt(lhs == rhs, tvm::tir::StringImm(msg), Evaluate(0)); } -PrimFunc MakePackedAPI(PrimFunc&& func, int 
num_unpacked_args) { +PrimFunc MakePackedAPI(PrimFunc&& func) { auto global_symbol = func->GetAttr(tvm::attr::kGlobalSymbol); ICHECK(global_symbol) << "MakePackedAPI: Expect PrimFunc to have the global_symbol attribute"; @@ -152,14 +152,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { auto* func_ptr = func.CopyOnWrite(); const Stmt nop = Evaluate(0); int num_args = static_cast(func_ptr->params.size()); - ICHECK_LE(num_unpacked_args, num_args); - bool pack_args = (num_unpacked_args == -1) || (num_args > num_unpacked_args); - if (num_unpacked_args == -1) { - // reset to zero - num_unpacked_args = 0; - } - ICHECK_GE(num_unpacked_args, 0); - int num_packed_args = num_args - num_unpacked_args; + // Data field definitions // The packed fields Var v_packed_args("args", DataType::Handle()); @@ -170,7 +163,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { Var v_out_ret_tcode("out_ret_tcode", PointerType(PrimType(DataType::Int(32)))); Var v_resource_handle("resource_handle", DataType::Handle()); // The arguments of the function. - Array args; + // The device context Var device_id("dev_id"); Integer device_type(target_device_type); @@ -194,14 +187,6 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { } return res; }; - // --------------------------- - // start of logics - // add signature for packed arguments. - if (pack_args) { - args.push_back(v_packed_args); - args.push_back(buf_packed_arg_type_ids->data); - args.push_back(v_num_packed_args); - } // Need to re-declare vars, in case some arguments also appears in the buffer. 
std::vector> var_def; @@ -219,7 +204,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // Pluck the device API context out based on name if (param->name_hint == kDeviceContextVar) { - num_packed_args--; + num_args--; v_resource_handle = param; continue; } @@ -232,44 +217,34 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { var_def.emplace_back(v_arg, param); } - if (i < num_packed_args) { - // Value loads - seq_init.emplace_back(LetStmt(v_arg, f_arg_value(v_arg.dtype(), i), nop)); - // type code checks - Var tcode(v_arg->name_hint + ".code", DataType::Int(32)); - seq_init.emplace_back( - LetStmt(tcode, BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), nop)); - DataType t = v_arg.dtype(); - if (t.is_handle()) { - std::ostringstream msg; - msg << name_hint << ": Expect arg[" << i << "] to be pointer"; - seq_check.emplace_back(AssertStmt(tcode == kTVMOpaqueHandle || tcode == kTVMNDArrayHandle || - tcode == kTVMDLTensorHandle || tcode == kTVMNullptr, - tvm::tir::StringImm(msg.str()), nop)); - } else if (t.is_int() || t.is_uint()) { - std::ostringstream msg; - msg << name_hint << ": Expect arg[" << i << "] to be int"; - seq_check.emplace_back(AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop)); - } else { - ICHECK(t.is_float()); - std::ostringstream msg; - msg << name_hint << ": Expect arg[" << i << "] to be float"; - seq_check.emplace_back(AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop)); - } + // Value loads + seq_init.emplace_back(LetStmt(v_arg, f_arg_value(v_arg.dtype(), i), nop)); + // type code checks + Var tcode(v_arg->name_hint + ".code", DataType::Int(32)); + seq_init.emplace_back( + LetStmt(tcode, BufferLoad(buf_packed_arg_type_ids, {IntImm(DataType::Int(32), i)}), nop)); + DataType t = v_arg.dtype(); + if (t.is_handle()) { + std::ostringstream msg; + msg << name_hint << ": Expect arg[" << i << "] to be pointer"; + seq_check.emplace_back(AssertStmt(tcode == 
kTVMOpaqueHandle || tcode == kTVMNDArrayHandle || + tcode == kTVMDLTensorHandle || tcode == kTVMNullptr, + tvm::tir::StringImm(msg.str()), nop)); + } else if (t.is_int() || t.is_uint()) { + std::ostringstream msg; + msg << name_hint << ": Expect arg[" << i << "] to be int"; + seq_check.emplace_back(AssertStmt(tcode == kDLInt, tvm::tir::StringImm(msg.str()), nop)); } else { - args.push_back(v_arg); + ICHECK(t.is_float()); + std::ostringstream msg; + msg << name_hint << ": Expect arg[" << i << "] to be float"; + seq_check.emplace_back(AssertStmt(tcode == kDLFloat, tvm::tir::StringImm(msg.str()), nop)); } } - // allow return value if the function is packed. - if (pack_args) { - args.push_back(v_out_ret_value); - args.push_back(v_out_ret_tcode); - args.push_back(v_resource_handle); - } - - size_t expected_nargs = num_unpacked_args + (pack_args ? 6 : 0); - ICHECK_EQ(args.size(), expected_nargs); + Array args{v_packed_args, buf_packed_arg_type_ids->data, + v_num_packed_args, v_out_ret_value, + v_out_ret_tcode, v_resource_handle}; // Arg definitions are defined before buffer binding to avoid the use before // def errors. 
@@ -286,9 +261,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { binder.BindDLTensor(kv.second, device_type, device_id, kv.first, kv.first->name_hint); } - if (num_unpacked_args == 0) { - func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc)); - } + func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc)); Stmt body = RewriteReturn(func_ptr->body, v_out_ret_value, v_out_ret_tcode); body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope, @@ -307,16 +280,11 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { } } - if (pack_args) { - std::ostringstream num_args_error; - num_args_error << name_hint << ": num_args should be " << num_packed_args; - std::vector arg_assert = { - MakeAssertEQ(v_num_packed_args, num_packed_args, num_args_error.str())}; - func_ptr->body = - MergeNest({arg_assert, seq_init, binder.init_nest(), seq_check, binder.asserts()}, body); - } else { - func_ptr->body = MergeNest({seq_init, binder.init_nest(), seq_check, binder.asserts()}, body); - } + std::ostringstream num_args_error; + num_args_error << name_hint << ": num_args should be " << num_args; + std::vector arg_assert = {MakeAssertEQ(v_num_packed_args, num_args, num_args_error.str())}; + func_ptr->body = + MergeNest({arg_assert, seq_init, binder.init_nest(), seq_check, binder.asserts()}, body); func_ptr->params = args; Array undefined = UndefinedVars(func_ptr->body, func_ptr->params); @@ -339,9 +307,8 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { namespace transform { -Pass MakePackedAPI(int num_unpacked_args) { - // packed arguments anyway while `num_unpacked_args` is -1 - auto pass_func = [num_unpacked_args](IRModule m, PassContext ctx) { +Pass MakePackedAPI() { + auto pass_func = [](IRModule m, PassContext ctx) { IRModuleNode* mptr = m.CopyOnWrite(); std::vector> updates; @@ -350,7 +317,7 @@ Pass MakePackedAPI(int num_unpacked_args) { PrimFunc 
func = GetRef(n); if (func->GetAttr(tvm::attr::kCallingConv, Integer(CallingConv::kDefault)) == CallingConv::kDefault) { - auto updated_func = MakePackedAPI(std::move(func), num_unpacked_args); + auto updated_func = MakePackedAPI(std::move(func)); updates.push_back({kv.first, updated_func}); } } @@ -365,7 +332,7 @@ Pass MakePackedAPI(int num_unpacked_args) { return tvm::transform::CreateModulePass(pass_func, 0, "tir.MakePackedAPI", {}); } -TVM_REGISTER_GLOBAL("tir.transform.MakePackedAPI").set_body_typed(MakePackedAPI); +TVM_REGISTER_GLOBAL("tir.transform.MakePackedAPI").set_body_typed([]() { return MakePackedAPI(); }); } // namespace transform } // namespace tir } // namespace tvm diff --git a/tests/python/unittest/test_tir_transform_make_packed_api.py b/tests/python/unittest/test_tir_transform_make_packed_api.py index 047c95b6134f..e78ed98d8569 100644 --- a/tests/python/unittest/test_tir_transform_make_packed_api.py +++ b/tests/python/unittest/test_tir_transform_make_packed_api.py @@ -39,9 +39,8 @@ def test_makeapi(): ) )(mod) - num_unpacked_args = 2 - f = tvm.tir.transform.MakePackedAPI(num_unpacked_args)(mod)["main"] - assert len(f.params) == 8 + f = tvm.tir.transform.MakePackedAPI()(mod)["main"] + assert len(f.params) == 6 def _find_assignment(stmt, var_name): From 7dbc68d1087e2ade75314f8b0525e30fc5c6b801 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 27 Sep 2022 06:26:37 -0700 Subject: [PATCH 258/704] [ONNX] Fix test_roi_align failure (#12906) RoiAlign-16 introduces coordinate_transformation_mode, which should be set to 'output_half_pixel' to omit the pixel shift for the input (for a backward-compatible behavior). This PR should fix the failure in https://ci.tlcpack.ai/job/docker-images-ci/job/docker-image-run-tests/231/testReport/junit/cython.tests.python.frontend.onnx/test_forward/Test___frontend__GPU_3_of_6___test_roi_align_cuda_/ Co-authored-by: Sevin F. 
Varoglu Co-authored-by: driazati --- Jenkinsfile | 20 ++++++++++---------- ci/jenkins/Jenkinsfile.j2 | 20 ++++++++++---------- tests/python/frontend/onnx/test_forward.py | 1 + 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a61ab1cd69a2..c49eb66711c7 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,16 +49,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> -ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1' -ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1' -ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1' -ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1' -ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1' -ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1' -ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251' -ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1' -ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1' -ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1' +ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6' +ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6' +ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6' +ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6' +ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6' +ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6' +ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6' +ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6' +ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6' +ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6' // <--- End of regex-scanned config. 
// Parameters to allow overriding (in Jenkins UI), the images diff --git a/ci/jenkins/Jenkinsfile.j2 b/ci/jenkins/Jenkinsfile.j2 index 6ba0c2df8efd..2fcbc9e7e042 100644 --- a/ci/jenkins/Jenkinsfile.j2 +++ b/ci/jenkins/Jenkinsfile.j2 @@ -51,16 +51,16 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils {% import 'ci/jenkins/macros.j2' as m with context -%} // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> -ci_lint = 'tlcpack/ci-lint:20220908-060034-62bdc91b1' -ci_gpu = 'tlcpack/ci-gpu:20220908-060034-62bdc91b1' -ci_cpu = 'tlcpack/ci-cpu:20220908-060034-62bdc91b1' -ci_minimal = 'tlcpack/ci-minimal:20220908-060034-62bdc91b1' -ci_wasm = 'tlcpack/ci-wasm:20220908-060034-62bdc91b1' -ci_i386 = 'tlcpack/ci-i386:20220908-060034-62bdc91b1' -ci_cortexm = 'tlcpack/ci-cortexm:20220909-090211-cb08a1251' -ci_arm = 'tlcpack/ci-arm:20220908-060034-62bdc91b1' -ci_hexagon = 'tlcpack/ci-hexagon:20220908-060034-62bdc91b1' -ci_riscv = 'tlcpack/ci-riscv:20220908-060034-62bdc91b1' +ci_lint = 'tlcpack/ci-lint:20220925-060158-71f25b3d6' +ci_gpu = 'tlcpack/ci-gpu:20220925-060158-71f25b3d6' +ci_cpu = 'tlcpack/ci-cpu:20220925-060158-71f25b3d6' +ci_minimal = 'tlcpack/ci-minimal:20220925-060158-71f25b3d6' +ci_wasm = 'tlcpack/ci-wasm:20220925-060158-71f25b3d6' +ci_i386 = 'tlcpack/ci-i386:20220925-060158-71f25b3d6' +ci_cortexm = 'tlcpack/ci-cortexm:20220925-060158-71f25b3d6' +ci_arm = 'tlcpack/ci-arm:20220925-060158-71f25b3d6' +ci_hexagon = 'tlcpack/ci-hexagon:20220925-060158-71f25b3d6' +ci_riscv = 'tlcpack/ci-riscv:20220925-060158-71f25b3d6' // <--- End of regex-scanned config. 
// Parameters to allow overriding (in Jenkins UI), the images diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 17a0513844ba..da6f5785023d 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4481,6 +4481,7 @@ def verify_roi_align( node = helper.make_node( "RoiAlign", + coordinate_transformation_mode="output_half_pixel", inputs=["X", "rois", "batch_indices"], outputs=["Y"], mode=mode, From 77d8eef5148da6517e471b52fec61ab40ea2436d Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 27 Sep 2022 11:39:19 -0500 Subject: [PATCH 259/704] [Runtime][Bugfix] Added type-checking for Array::insert (#12691) Prior to this commit, the following code would compile and run without error. This occurs because the typed `Array::insert` calls the untyped `ArrayNode::InitRange`, with no type-checking done before the call. ```c++ Var x("x"); Var y("y"); Array var_arr{x, y}; Array expr_arr{x + 1, y + 2}; // Erroneously inserts static-type PrimExpr, runtime-type Add, even // though neither PrimExpr is a type of Var. var_arr.insert(var_arr.begin(), expr_arr.begin(), expr_arr.end()); ``` After this commit, a `static_assert` in `Array::insert` and in `Array::Array(IterType,IterTYpe)` restricts the iterators, such that they must dereference to `T`, `Optional`, a subclass of `T`, or `Optional` where `U` is a subclass of `T`. The public method `ArrayNode::SetItem` exposes a similar issue. In the future, we may want to make it be private, accessed only through type-safe method in `Array::Set`. 
--- include/tvm/runtime/container/array.h | 5 +++++ src/te/schedule/schedule_lang.cc | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/tvm/runtime/container/array.h b/include/tvm/runtime/container/array.h index 11bacb18e92c..1b735e73c386 100644 --- a/include/tvm/runtime/container/array.h +++ b/include/tvm/runtime/container/array.h @@ -325,6 +325,8 @@ class Array : public ObjectRef { */ template Array(IterType first, IterType last) { + static_assert(is_valid_iterator_v, + "IterType cannot be inserted into a tvm::Array"); Assign(first, last); } @@ -481,6 +483,9 @@ class Array : public ObjectRef { */ template void insert(iterator position, IterType first, IterType last) { + static_assert(is_valid_iterator_v, + "IterType cannot be inserted into a tvm::Array"); + if (first == last) { return; } diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 0fcd6133c4a2..e8f4f65eb651 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -200,7 +200,7 @@ Stage& Stage::env_threads(Array threads) { ICHECK_EQ(self->env_threads.size(), 0U) << "Already set env_threads"; Array& leaf_vars = self->leaf_iter_vars; Array& all_vars = self->all_iter_vars; - std::vector temp; + std::vector temp; for (IterVar iv : threads) { temp.push_back(iv); } From 9a673faa74ed7cd715a4e011716bcce3fd2158b6 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Tue, 27 Sep 2022 09:41:34 -0700 Subject: [PATCH 260/704] [ci] Initialize git during deploys (#12909) We rely on some utilities scripts in the deploy steps so they also need a git checkout. 
Fixes issues like https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/main/4345/pipeline Co-authored-by: driazati --- Jenkinsfile | 14 ++++++++------ ci/jenkins/Deploy.groovy.j2 | 10 ++++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c49eb66711c7..e964ac79a3ce 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,7 +45,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2022-09-16T08:47:49.743918 +// Generated at 2022-09-26T10:48:49.577077 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> @@ -4246,7 +4246,8 @@ def deploy() { node('CPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/deploy-docker") { timeout(time: max_time, unit: 'MINUTES') { - try { + init_git() + try { withCredentials([string( credentialsId: 'dockerhub-tlcpackstaging-key', variable: 'DOCKERHUB_KEY', @@ -4290,10 +4291,11 @@ def deploy() { node('CPU') { ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/tag-images") { timeout(time: max_time, unit: 'MINUTES') { - withCredentials([string( - credentialsId: 'dockerhub-tlcpack-key', - variable: 'TLCPACK_TOKEN', - )]) { + init_git() + withCredentials([string( + credentialsId: 'dockerhub-tlcpack-key', + variable: 'TLCPACK_TOKEN', + )]) { try { sh( script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin', diff --git a/ci/jenkins/Deploy.groovy.j2 b/ci/jenkins/Deploy.groovy.j2 index 9812e1113598..798af6736e1e 100644 --- a/ci/jenkins/Deploy.groovy.j2 +++ b/ci/jenkins/Deploy.groovy.j2 @@ -99,6 +99,7 @@ def deploy() { feature_flag="env.DEPLOY_DOCKER_IMAGES == 'yes' && rebuild_docker_images && upstream_revision != null", ws="tvm/deploy-docker", ) %} + init_git() try { withCredentials([string( credentialsId: 'dockerhub-tlcpackstaging-key', @@ -130,10 
+131,11 @@ def deploy() { feature_flag="env.DOCS_DEPLOY_ENABLED == 'yes'", ws="tvm/tag-images", ) %} - withCredentials([string( - credentialsId: 'dockerhub-tlcpack-key', - variable: 'TLCPACK_TOKEN', - )]) { + init_git() + withCredentials([string( + credentialsId: 'dockerhub-tlcpack-key', + variable: 'TLCPACK_TOKEN', + )]) { try { sh( script: 'echo $TLCPACK_TOKEN | docker login --username octomldriazati --password-stdin', From 332b1469b71cf7eb5e40ec385eb9664f3959643a Mon Sep 17 00:00:00 2001 From: Venkat Rasagna Komatireddy <89959097+rasagna-quic@users.noreply.github.com> Date: Tue, 27 Sep 2022 22:50:13 +0530 Subject: [PATCH 261/704] [Hexagon] depth_to_space slice op (#12669) hexagon slice depth_to_space op --- python/tvm/topi/hexagon/slice_ops/__init__.py | 1 + .../topi/hexagon/slice_ops/depth_to_space.py | 43 ++++++ .../test_hexagon/topi/test_depth_to_space.py | 136 ++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 python/tvm/topi/hexagon/slice_ops/depth_to_space.py create mode 100644 tests/python/contrib/test_hexagon/topi/test_depth_to_space.py diff --git a/python/tvm/topi/hexagon/slice_ops/__init__.py b/python/tvm/topi/hexagon/slice_ops/__init__.py index b96156dc46d2..5f86e706af50 100644 --- a/python/tvm/topi/hexagon/slice_ops/__init__.py +++ b/python/tvm/topi/hexagon/slice_ops/__init__.py @@ -35,3 +35,4 @@ from .relu import relu_compute, relu_stir_schedule from .tanh import tanh_te_compute, tanhf16_schedule from .dwconv2d import * +from .depth_to_space import d2s_compute, d2s_schedule diff --git a/python/tvm/topi/hexagon/slice_ops/depth_to_space.py b/python/tvm/topi/hexagon/slice_ops/depth_to_space.py new file mode 100644 index 000000000000..aa14a97f5ee9 --- /dev/null +++ b/python/tvm/topi/hexagon/slice_ops/depth_to_space.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Compute and schedule for depth to space slice op +""" + +from tvm import te, tir, topi +from ..utils import get_layout_transform_fn + + +def d2s_compute(inp, block_size, layout, mode): + """depth_to_space compute""" + return topi.nn.depth_to_space(inp, block_size=block_size, layout=layout, mode=mode) + + +def d2s_schedule(inp, out, input_layout, output_layout): + """Schedule for depth to space: top level function""" + if (input_layout != output_layout) or ( + output_layout not in ("nhwc-8h2w32c2w-2d", "nhwc-8h8w32c-2d") + ): + raise RuntimeError( + f"Unexpected input_layout, output_layout '{input_layout, output_layout}'" + ) + d2s_func = te.create_prim_func([inp, out]) + sch = tir.Schedule(d2s_func, debug_mask="all") + compute = sch.get_block("depth_to_space") + sch.transform_layout(compute, inp.name, get_layout_transform_fn(input_layout)) + sch.transform_layout(compute, out.name, get_layout_transform_fn(output_layout)) + return sch diff --git a/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py new file mode 100644 index 000000000000..f74d13f641d5 --- /dev/null +++ b/tests/python/contrib/test_hexagon/topi/test_depth_to_space.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one 
+# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=line-too-long, redefined-outer-name + +"""Test depth_to_space slice op for hexagon""" + +import numpy as np +import pytest + +import tvm +from tvm import te +import tvm.testing +from tvm.topi.hexagon.slice_ops.depth_to_space import d2s_compute, d2s_schedule +from tvm.topi.testing import depth_to_space_python + +from ..infrastructure import allocate_hexagon_array, transform_numpy + + +d2s_fp16_tests = ( + ((1, 8, 8, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 8, 8, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 16, 16, 256), 2, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 16, 16, 1024), 4, "CDR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 8, 8, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 8, 8, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 16, 16, 256), 2, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), + ((1, 16, 16, 1024), 4, "DCR", "float16", "nhwc-8h2w32c2w-2d", "nhwc-8h2w32c2w-2d"), +) + +d2s_uint8_tests = ( + ((1, 8, 8, 256), 2, "CDR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"), + ((1, 8, 8, 1024), 4, "CDR", "uint8", "nhwc-8h8w32c-2d", 
"nhwc-8h8w32c-2d"), + ((1, 8, 8, 256), 2, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"), + ((1, 8, 8, 1024), 4, "DCR", "uint8", "nhwc-8h8w32c-2d", "nhwc-8h8w32c-2d"), +) + + +class TestD2SSlice: + """Test class that defines the Depth to Space slice test""" + + (input_shape, block_size, mode, dtype, input_layout, output_layout,) = tvm.testing.parameters( + *d2s_fp16_tests, + *d2s_uint8_tests, + ) + + working_scope = tvm.testing.parameter("global.vtcm") + + @tvm.testing.fixture + def input_np(self, input_shape, dtype): + return np.random.uniform(size=input_shape).astype(dtype) + + @tvm.testing.fixture + def transformed_input_np(self, input_np, input_layout): + return transform_numpy(input_np, "nhwc", input_layout) + + @tvm.testing.fixture + def ref_output_np(self, input_np, block_size, mode): + a_np = np.transpose(input_np, axes=[0, 3, 1, 2]) + ref_np = depth_to_space_python(a_np, block_size, mode=mode) + ref_np = np.transpose(ref_np, axes=[0, 2, 3, 1]) + return ref_np + + @tvm.testing.fixture + def transformed_ref_output_np(self, ref_output_np, output_layout): + return transform_numpy(ref_output_np, "nhwc", output_layout) + + @tvm.testing.requires_hexagon + def test_d2s_slice( + self, + input_shape, + block_size, + mode, + dtype, + input_layout, + output_layout, + hexagon_session, + working_scope, + transformed_input_np, + transformed_ref_output_np, + ): + """Top level testing function for depth to space""" + Input = te.placeholder(input_shape, name="Input", dtype=dtype) + + Output = d2s_compute(Input, block_size, "NHWC", mode) + + target_hexagon = tvm.target.hexagon("v69") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + tir_s = d2s_schedule(Input, Output, input_layout, output_layout) + + input_data = allocate_hexagon_array( + hexagon_session.device, + data=transformed_input_np, + axis_separators=[4], + mem_scope=working_scope, + ) + output_data = allocate_hexagon_array( + hexagon_session.device, + 
tensor_shape=transformed_ref_output_np.shape, + dtype=transformed_ref_output_np.dtype, + axis_separators=[4], + mem_scope=working_scope, + ) + with tvm.transform.PassContext(opt_level=3): + runtime_module = tvm.build( + tir_s.mod, [Input, Output], target=target, name="depth_to_space" + ) + mod = hexagon_session.load_module(runtime_module) + + mod(input_data, output_data) + output_np = output_data.numpy() + + tvm.testing.assert_allclose( + output_np, + transformed_ref_output_np, + 1e-3, + 1e-3, + ) + + +if __name__ == "__main__": + tvm.testing.main() From 5a807e27c04a4dcb80ab35e8e601f5a9d5e78986 Mon Sep 17 00:00:00 2001 From: Janet Schneider Date: Tue, 27 Sep 2022 10:35:44 -0700 Subject: [PATCH 262/704] [Hexagon] [runtime] Add thread manager to resource management (#12905) * Add thread manager to Acquire/ReleaseResources * Change logging to debug logs * Fix lint * Increase pipe size --- src/runtime/hexagon/hexagon_device_api.h | 26 ++++++++++++++++--- .../hexagon/hexagon_device_api_tests.cc | 11 ++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/runtime/hexagon/hexagon_device_api.h b/src/runtime/hexagon/hexagon_device_api.h index b8861238771b..4f544faffba1 100644 --- a/src/runtime/hexagon/hexagon_device_api.h +++ b/src/runtime/hexagon/hexagon_device_api.h @@ -31,6 +31,7 @@ #include "hexagon_buffer.h" #include "hexagon_buffer_manager.h" +#include "hexagon_thread_manager.h" namespace tvm { namespace runtime { @@ -54,17 +55,26 @@ class HexagonDeviceAPI final : public DeviceAPI { void AcquireResources() { CHECK_EQ(runtime_hexbuffs, nullptr); runtime_hexbuffs = std::make_unique(); - LOG(INFO) << "runtime_hexbuffs created"; + DLOG(INFO) << "runtime_hexbuffs created"; mgr = runtime_hexbuffs.get(); + + CHECK_EQ(runtime_threads, nullptr); + runtime_threads = std::make_unique(threads, stack_size, pipe_size); + DLOG(INFO) << "runtime_threads created"; } //! 
\brief Ensures all runtime resources are freed void ReleaseResources() { + CHECK(runtime_threads) << "runtime_threads was not created in AcquireResources"; + runtime_threads.reset(); + DLOG(INFO) << "runtime_threads reset"; + + CHECK(runtime_hexbuffs) << "runtime_hexbuffs was not created in AcquireResources"; if (runtime_hexbuffs && !runtime_hexbuffs->empty()) { - LOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources"; + DLOG(INFO) << "runtime_hexbuffs was not empty in ReleaseResources"; } mgr = &hexbuffs; - LOG(INFO) << "runtime_hexbuffs reset"; + DLOG(INFO) << "runtime_hexbuffs reset"; runtime_hexbuffs.reset(); } @@ -139,6 +149,10 @@ class HexagonDeviceAPI final : public DeviceAPI { */ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; + HexagonThreadManager* ThreadManager() { + return runtime_threads ? runtime_threads.get() : nullptr; + } + protected: //! Standard Device API interface to copy data from one storage to another. void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, @@ -164,6 +178,12 @@ class HexagonDeviceAPI final : public DeviceAPI { //! \brief Current buffer manager HexagonBufferManager* mgr; + + //! 
\brief Thread manager + std::unique_ptr runtime_threads; + const unsigned threads{6}; + const unsigned pipe_size{1000}; + const unsigned stack_size{0x4000}; // 16KB }; } // namespace hexagon } // namespace runtime diff --git a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc index 1827c4059dea..b54e40e87958 100644 --- a/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc +++ b/tests/cpp-runtime/hexagon/hexagon_device_api_tests.cc @@ -164,3 +164,14 @@ TEST_F(HexagonDeviceAPITest, leak_resources) { hexapi->FreeDataSpace(hex_dev, pre_runtime_buf); hexapi->AcquireResources(); } + +// Ensure thread manager is properly configured and destroyed +// in Acquire/Release +TEST_F(HexagonDeviceAPITest, thread_manager) { + HexagonThreadManager* threads = hexapi->ThreadManager(); + CHECK(threads != nullptr); + hexapi->ReleaseResources(); + threads = hexapi->ThreadManager(); + CHECK(threads == nullptr); + hexapi->AcquireResources(); +} From 82e6fc41f8069fdaf98991faee31e21f77e2cf8c Mon Sep 17 00:00:00 2001 From: Mohamad Katanbaf Date: Tue, 27 Sep 2022 13:55:45 -0700 Subject: [PATCH 263/704] [microTVM] add the option to open a saved micro project for debugging (#12495) * add the option to open a saved project for debugging. 
* addressing comments Co-authored-by: Mohamad --- python/tvm/micro/build.py | 26 ++++++++++-- python/tvm/micro/session.py | 56 ++++++++++++++++++-------- python/tvm/micro/testing/evaluation.py | 39 ++++++++++++------ 3 files changed, 88 insertions(+), 33 deletions(-) diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 795a61edcbb3..92574ce2f8c2 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -20,7 +20,6 @@ import json import logging import os -import pathlib import contextlib import enum @@ -115,23 +114,40 @@ class AutoTvmModuleLoader: Parameters ---------- - template_project_dir : Union[pathlib.Path, str] + template_project_dir : Union[os.PathLike, str] project template path project_options : dict project generation option + + project_dir: str + if use_existing is False: The path to save the generated microTVM Project. + if use_existing is True: The path to a generated microTVM Project for debugging. + + use_existing: bool + skips the project generation and opens transport to the project at the project_dir address. 
""" def __init__( - self, template_project_dir: Union[pathlib.Path, str], project_options: dict = None + self, + template_project_dir: Union[os.PathLike, str], + project_options: dict = None, + project_dir: Union[os.PathLike, str] = None, + use_existing: bool = False, ): self._project_options = project_options + self._use_existing = use_existing - if isinstance(template_project_dir, (pathlib.Path, str)): + if isinstance(template_project_dir, (os.PathLike, str)): self._template_project_dir = str(template_project_dir) elif not isinstance(template_project_dir, str): raise TypeError(f"Incorrect type {type(template_project_dir)}.") + if isinstance(project_dir, (os.PathLike, str)): + self._project_dir = str(project_dir) + else: + self._project_dir = None + @contextlib.contextmanager def __call__(self, remote_kw, build_result): with open(build_result.filename, "rb") as build_file: @@ -147,6 +163,8 @@ def __call__(self, remote_kw, build_result): build_result_bin, self._template_project_dir, json.dumps(self._project_options), + self._project_dir, + self._use_existing, ], ) system_lib = remote.get_function("runtime.SystemLib")() diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py index 967eaee62958..8a51f1082dda 100644 --- a/python/tvm/micro/session.py +++ b/python/tvm/micro/session.py @@ -20,7 +20,10 @@ import json import logging import sys - +import os +import pathlib +import shutil +from typing import Union from ..error import register_error from .._ffi import get_global_func, register_func from ..contrib import graph_executor @@ -259,6 +262,8 @@ def compile_and_create_micro_session( mod_src_bytes: bytes, template_project_dir: str, project_options: dict = None, + project_dir: Union[os.PathLike, str] = None, + use_existing: bool = False, ): """Compile the given libraries and sources into a MicroBinary, then invoke create_micro_session. 
@@ -275,25 +280,44 @@ def compile_and_create_micro_session( project_options: dict Options for the microTVM API Server contained in template_project_dir. - """ - temp_dir = utils.tempdir() - # Keep temp directory for generate project - temp_dir.set_keep_for_debug(True) - model_library_format_path = temp_dir / "model.tar.gz" - with open(model_library_format_path, "wb") as mlf_f: - mlf_f.write(mod_src_bytes) + project_dir: Union[os.PathLike, str] + if use_existing is False: The path to save the generated microTVM Project. + if use_existing is True: The path to a generated microTVM Project for debugging. - try: - template_project = project.TemplateProject.from_directory(template_project_dir) - generated_project = template_project.generate_project_from_mlf( - model_library_format_path, - str(temp_dir / "generated-project"), + use_existing: bool + skips the project generation and opens transport to the project at the project_dir address. + """ + + if use_existing: + project_dir = pathlib.Path(project_dir) + assert project_dir.is_dir(), f"{project_dir} does not exist." 
+ build_dir = project_dir / "generated-project" / "build" + shutil.rmtree(build_dir) + generated_project = project.GeneratedProject.from_directory( + project_dir / "generated-project", options=json.loads(project_options), ) - except Exception as exception: - logging.error("Project Generate Error: %s", str(exception)) - raise exception + else: + if project_dir: + temp_dir = utils.tempdir(custom_path=project_dir, keep_for_debug=True) + else: + temp_dir = utils.tempdir() + + model_library_format_path = temp_dir / "model.tar.gz" + with open(model_library_format_path, "wb") as mlf_f: + mlf_f.write(mod_src_bytes) + + try: + template_project = project.TemplateProject.from_directory(template_project_dir) + generated_project = template_project.generate_project_from_mlf( + model_library_format_path, + str(temp_dir / "generated-project"), + options=json.loads(project_options), + ) + except Exception as exception: + logging.error("Project Generate Error: %s", str(exception)) + raise exception generated_project.build() generated_project.flash() diff --git a/python/tvm/micro/testing/evaluation.py b/python/tvm/micro/testing/evaluation.py index c8a90ff5b40f..1d80ed5568b2 100644 --- a/python/tvm/micro/testing/evaluation.py +++ b/python/tvm/micro/testing/evaluation.py @@ -27,6 +27,7 @@ from pathlib import Path from contextlib import ExitStack import tempfile +import shutil import tvm from tvm.relay.op.contrib import cmsisnn @@ -53,6 +54,7 @@ def tune_model( "project_type": "host_driven", **(project_options or {}), } + module_loader = tvm.micro.AutoTvmModuleLoader( template_project_dir=tvm.micro.get_microtvm_template_projects(platform), project_options=project_options, @@ -99,6 +101,7 @@ def create_aot_session( timeout_override=None, use_cmsis_nn=False, project_options=None, + use_existing=False, ): """AOT-compiles and uploads a model to a microcontroller, and returns the RPC session""" @@ -125,21 +128,31 @@ def create_aot_session( parameter_size = 
len(tvm.runtime.save_param_dict(lowered.get_params())) print(f"Model parameter size: {parameter_size}") - project = tvm.micro.generate_project( - str(tvm.micro.get_microtvm_template_projects(platform)), - lowered, - build_dir / "project", - { - f"{platform}_board": board, - "project_type": "host_driven", - # {} shouldn't be the default value for project options ({} - # is mutable), so we use this workaround - **(project_options or {}), - }, - ) + project_options = { + f"{platform}_board": board, + "project_type": "host_driven", + # {} shouldn't be the default value for project options ({} + # is mutable), so we use this workaround + **(project_options or {}), + } + + if use_existing: + shutil.rmtree(build_dir / "project" / "build") + project = tvm.micro.GeneratedProject.from_directory( + build_dir / "project", + options=project_options, + ) + + else: + project = tvm.micro.generate_project( + str(tvm.micro.get_microtvm_template_projects(platform)), + lowered, + build_dir / "project", + project_options, + ) + project.build() project.flash() - return tvm.micro.Session(project.transport(), timeout_override=timeout_override) From a07a46ed19edcb41ef72d47299dee0dbb336260e Mon Sep 17 00:00:00 2001 From: Christian Convey Date: Tue, 27 Sep 2022 17:08:09 -0400 Subject: [PATCH 264/704] [TIR] add unit-tests for upcoming primfunc-slicing (#12794) [TIR] Add disabled primfunc-slice unit tests Add unit tests (initially disabled) to motivate upcoming work on semi-automated slicing of primfuncs. (I.e., extracting some subtree of a primfunc body's TIR into a separate primfunc.) 
--- tests/python/unittest/test_slice_tir.py | 216 ++++++++++++++++++++++++ 1 file changed, 216 insertions(+) create mode 100644 tests/python/unittest/test_slice_tir.py diff --git a/tests/python/unittest/test_slice_tir.py b/tests/python/unittest/test_slice_tir.py new file mode 100644 index 000000000000..03cd8f67d6b2 --- /dev/null +++ b/tests/python/unittest/test_slice_tir.py @@ -0,0 +1,216 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import tvm.testing +from tvm.script import tir as T +import pytest + +# --------------------------------------------------------------------------------------------------- +# ABOUT THIS FILE: +# --------------------------------------------------------------------------------------------------- +# We (cconvey / OctoML) are working on a sequence of PRs to allow a single TIR primfunc's +# AST to be sliced into multiple partitions, where each partition will be converted into +# a new TIR primfunc. (See https://en.wikipedia.org/wiki/Program_slicing). +# +# The unit tests below provide a roadmap for that sequence of PRs; each PR should allow +# one more of these tests to pass. +# +# NOTE: These unit tests may change as work progresses. They aren't meant to +# indicate hard requirements.
+ +# NOTE! The `tvm.testing.CompareBeforeAfter` class provides TWO useful mechanisms for +# these tests: +# +# (a) It lets us specify code snippets which are valid Python, but which aren't YET +# recognized as valid TVMScript. This allows unit tests for new constructs, +# e.g. 'call_tir(...)' to simply be disabled rather than fully commented out. +# +# (b) It lets us structurally compare the TIR bodies of two primfuncs. +# +# Note that some of the tests below will require the structural comparison of +# two entire IRModules, not just primfuncs. This will require adding functionality +# to the `CompareBeforeAfter` class, or implementing that level of comparison within +# the individual unit tests. +# +# Some of the unit tests below require whole-IRModule comparison. For expedience +# we simply comment out the (early draft) bodies of those unit tests, rather than +# hacking their structure to get the benefits of (a). + + +# --------------------------------------------------------------------------------------------------- +# 'CALL_TIR' (AND RELATED) CAVEATS: +# --------------------------------------------------------------------------------------------------- +# (c) "call_tir" is a placeholder name. +# The TVM "Relax" effort also defines a node named "call_tir", which is likely +# to become something different from what we're calling "call_tir" here. So +# we may rename *this* "call_tir" during implementation. +# +# (d) For "call_tir" calls, the syntax/semantics for passing buffer regions is still +# an active area of development. So that detail of these unit tests is likely +# to change. +# +# (e) The specific string "extract_as_subroutine" used to annotate some IR Blocks, +# i.e., `T.annotate("extract_as_subroutine", ...)`, may change as work progresses. + + +# --------------------------------------------------------------------------------------------------- +# step 1: Simply passes Python / TVMScript parsing.
+# --------------------------------------------------------------------------------------------------- +# +# The only requirement for this test is that the TVMScript parser +# doesn't raise an error when encountering `T.call_tir(foo)`, +# where "foo" is a syntactically valid TVMScript function name. +# +# NOTE! The role of this unit test should evolve as follows: +# 1) Initially the test should fail, because we haven't yet changed the TVMScript +# parser to support 'call_tir'. +# +# 2) Initial TVMScript support for 'call_tir' will be minimal, essentially ignoring +# it. This test should pass once that change is made. +# +# 3) As support for 'call_tir' becomes more complete, this test should once again +# fail, because the specified callee doesn't exist. This test should be updated +# to once again expect failure. +@pytest.mark.xfail(reason="Awaiting TVMScript support for 'call_tir' token.", strict=True) +class TestParseCallTIR(tvm.testing.CompareBeforeAfter): + """ + Simply confirm that the TIR node `call_tir` doesn't interfere with + the successful parsing of the TVMScript. + """ + + def before(): + T.call_tir(add_one) + T.evalute(0) + + def expected(): + T.evaluate(0) + + # Provide a trivial 'transform' pass to satisfy the requirements of + # tvm.testing.CompareBeforeAfter. + transform = tvm.tir.transform.prim_func_pass(lambda func, _mod, _ctx: func, 0) + + +# --------------------------------------------------------------------------------------------------- +# step 2: transform annotated block ==> separate primfuncs + call_tir +# +# NOTE: This early-draft version of the unit test contains pseudocode to compare entire IRModule +# objects, analogously to how tvm.testing.CompareBeforeAfter compares two primfuncs. +# TVM's testing infrastructure currently has no such functionality, and it will need to be added +# (or approximated) to make this unit test useable. 
+# --------------------------------------------------------------------------------------------------- +@pytest.mark.xfail( + reason="Awaiting TVMScript support for 'call_tir' and T.annotation(\"extract_as_subroutine\").", + strict=True, +) +class TestAnnotateAndSliceTIR(tvm.testing.CompareBeforeAfter): + # def test_annotate_and_slice(): + # @tvm.script.ir_module + # class irmod_before: + # @T.prim_func + # def main(A: T.Buffer[(1,), "int8"): + # #A = T.match_buffer(a, (1,), "int8") + # A[0] = 0 + # with T.block("block_foo"): # optional: give this block a name, perhaps for testing? + # # NOTE: nice to have: human control over name used for the generated callee + # T.annotate("extract_as_subroutine", "add_one") + # A[0] += 1 + # return 42 + # + # @tvm.script.ir_module + # class irmod_after: + # @T.prim_func + # def main(): + # A = T.buffer[[1], "int8"] + # A[0] = 0 + # with T.block("block_foo"): + # call_tir(add_one, A) + # + # @T.prim_func + # def add_one(X: T.buffer[[1], "int8"]): + # X[0] += 1 + pass + + +# --------------------------------------------------------------------------------------------------- +# step 3: transform call_tir ==> packed call +# --------------------------------------------------------------------------------------------------- +@pytest.mark.xfail( + reason="Awaiting TVMScript support for lowering of 'T.call_tir' to 'T.call_packed'.", + strict=True, +) +class TestLowerCallTir(tvm.testing.CompareBeforeAfter): + # @tvm.script.ir_module + # class test_lower_before: + # @T.prim_func + # def main(): + # A = T.buffer[[1], "int8"] + # A[0] = 0 + # with T.block(): + # call_tir(add_one, A) + # + # @T.prim_func + # def add_one(X: T.buffer[[1], "int8"]): + # X[0] += 1 + # + # @tvm.script.ir_module + # class test_lower_after: + # @T.prim_func + # def main(): + # A = T.buffer[[1], "int8"] + # A[0] = 0 + # with T.block(): + # # TODO: figure out the right TVMScript thing to do here + # call_packed(add_one, A) # not sure about this function / interface + 
# + # @T.prim_func + # def add_one(X: T.buffer[[1], "int8"]): + # X[0] += 1 + # + # TODO(cconvey): additional test logic needed. + # NOTE(lunderberg): Will also need a `transform` defined here. + # I think we'll want it to occur in `tvm.tir.transform.MakePackedAPI`. + pass + + +# --------------------------------------------------------------------------------------------------- +# step 4: end-to-end functionality +# --------------------------------------------------------------------------------------------------- + + +@pytest.mark.xfail(reason="Awaiting end-to-end support for Primfunc slicing.", strict=True) +class TestPrimfuncSlicingEndToEnd(tvm.testing.CompareBeforeAfter): + # @tvm.script.ir_module + # class test_annotate_before: + # @T.prim_func + # def main(): + # A = T.buffer[[1], "int8"] + # A[0] = 0 + # with T.block(): # optional: give this block a name, perhaps for testing? + # # NOTE: nice to have: human control over name used for the generated callee + # T.annotate("extract_as_subroutine", "add_one") + # A[0] += 1 + # assert(A[0] == 1) + # + # TODO(cconvey): additional test logic needed: + # Starting with the IRModule shown above, end up with a running test that + # module actually increments A[0] on Hexagon and x86-64 Linux. + # + # NOTE(lunderberg): We can use the function calls currently generated by `SplitHostDevice` as a template + # (see https://github.com/apache/tvm/blob/9a673faa74ed7cd715a4e011716bcce3fd2158b6/src/tir/transforms/split_host_device.cc#L336). + # Overall, we'll want to output a Call node with the operation builtin::tvm_call_packed(). + pass From bec9f16d42fc11ac97e0f01af007551398b025a2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 27 Sep 2022 16:51:08 -0500 Subject: [PATCH 265/704] [TIR][Transform] Clear buffer_map during MakeUnpackedAPI (#12891) * [TIR][Transform] Clear buffer_map during MakeUnpackedAPI This mimics the behavior in `MakePackedAPI`, and is assumed to be the case for some codegens. 
* Remove read of buffer_map in ethosu.tir_to_cs_translator This previously relied on `MakeUnpackedAPI` preserving the `PrimFunc::buffer_map`, even after it had been used for lowering. It now reads from the `BufferLoad` and `BufferStore` nodes to determine buffer shapes. * Added more documentation for MakePackedAPI/MakeUnpackedAPI --- .../relay/backend/contrib/ethosu/tir/utils.py | 30 +++++++++++++++ .../contrib/ethosu/tir_to_cs_translator.py | 37 +++++++++++++------ python/tvm/tir/transform/transform.py | 30 +++++++++++++++ src/tir/transforms/make_unpacked_api.cc | 7 +--- 4 files changed, 88 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py index a823667234df..396735a07c4c 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/utils.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/utils.py @@ -158,6 +158,36 @@ def get_outer_loops(stmt, layout): return None +def collect_buffer_map(stmt): + """Collect a map of Var -> Buffer + + Generate a map from a buffer's backing `tir.Var` to the + `tir.Buffer` object that uses it. If multiple such buffers exist, + return the first occurrence. + + Parameters + ---------- + stmt : tvm.tir.Stmt + The statement to get the BufferLoads from. + + Returns + ------- + buffer_map : Dict[Var, Buffer] + The map from buffer var to the buffers that use it. + """ + buffer_map = {} + + def _visit(node): + if isinstance(node, (tvm.tir.BufferLoad, tvm.tir.BufferStore)): + buf = node.buffer + if buf.data not in buffer_map: + buffer_map[buf.data] = buf + + tvm.tir.stmt_functor.post_order_visit(stmt, _visit) + + return buffer_map + + def get_loads(stmt): """Get the BufferLoad statements. 
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py index f5c8994bec77..19f009d284ab 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py @@ -29,6 +29,7 @@ from tvm.relay.backend.contrib.ethosu import util from tvm.relay.backend.contrib.ethosu import vela_api from tvm.relay.backend.contrib.ethosu.tir import spec +from tvm.relay.backend.contrib.ethosu.tir import utils as tir_utils class BufferType(Enum): @@ -254,26 +255,40 @@ def extract_param_base_addresses(mod, buffer_info, scratch_region_map) -> List[u assert len(mod.functions.items()) == 1 primfunc = mod.functions.items()[0][1] + buffer_map = tir_utils.collect_buffer_map(primfunc.body) + base_addresses = list() idx = 0 + for param in primfunc.params: # constants are pooled together and handled specially # this will change after tir.allocate_const. # For now, we are skipping generating buffer addresses here if buffer_info[param].btype == BufferType.constant: continue - buffer = primfunc.buffer_map[param] - dtype = buffer.dtype - element_size_bytes = np.iinfo(dtype).bits // 8 - size_bytes = element_size_bytes * np.prod(list(buffer.shape)) - base_addresses.append( - util.BaseAddress( - param.name.replace("-", "_"), - idx, - _get_region(buffer_info[param].btype, param, scratch_region_map), - size_bytes, + + if param in buffer_map: + buffer = buffer_map[param] + dtype = buffer.dtype + element_size_bytes = np.iinfo(dtype).bits // 8 + size_bytes = element_size_bytes * np.prod(list(buffer.shape)) + base_addresses.append( + util.BaseAddress( + param.name.replace("-", "_"), + idx, + _get_region(buffer_info[param].btype, param, scratch_region_map), + size_bytes, + ) + ) + else: + base_addresses.append( + util.BaseAddress( + param.name.replace("-", "_"), + idx, + _get_region(buffer_info[param].btype, param, scratch_region_map), + 0, + ) ) - ) 
idx += 1 return base_addresses diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 3c1ca196f1b0..d95d15c0dfbe 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -390,6 +390,26 @@ def LowerCustomDatatypes(): def MakePackedAPI(): """Transform the PrimFuncs in the module to a packed func API. + Prior to this pass, the PrimFunc may have Buffer arguments defined + in the `PrimFuncNode::buffer_map`. This pass consumes the + `buffer_map`, using it to generate `TVMArgs` and `TVMRetValue*` + arguments that implement the `PackedFunc` API. + + For static shapes, the `BufferNode::shape`, `BufferNode::strides`, + and `BufferNode::elem_offset` member variables are used to + generate runtime checks on the corresponding member variables in + the user-provided `DLTensor*` or `tvm.nd.array` argument. (e.g. A + PrimFunc that accepts a buffer of shape `[16,32]` validates that + the `DLTensor::shape` array is `[16,32]`.) + + For dynamic Buffers, in which one or more of these `BufferNode` member + variables use `tir.Var` that are not defined by other PrimFunc + parameters, these are instead used to define the variables based on + the corresponding `DLTensor` members. (e.g. A PrimFunc that accepts a + buffer of shape `[tir.Var("n"), tir.Var("m")]`, when passed a + `DLTensor` of shape `[16,32]`, will define `n = 16` and `m = 32`, based + on the argument's shape.) + Returns ------- fpass : tvm.transform.Pass @@ -401,6 +421,16 @@ def MakePackedAPI(): def MakeUnpackedAPI(): """Transform the PrimFuncs in the module to a C API compatible with internal calls. + Prior to this pass, the PrimFunc may have Buffer arguments defined in + the `PrimFuncNode::buffer_map`. This pass consumes the `buffer_map`, + using it to generate `T*` arguments (e.g. `float32*`) that can be + directly called by a C API.
+ + For static shapes, no runtime validation is performed to confirm that + the argument buffer's shape matches the expected shape. For dynamic + shapes, `MakeUnpackedAPI` requires that the dynamic parameters be + passed as separate `tir.Var` parameters. + Returns ------- fpass : tvm.transform.Pass diff --git a/src/tir/transforms/make_unpacked_api.cc b/src/tir/transforms/make_unpacked_api.cc index c57daeabbe1d..87e8f38895cd 100644 --- a/src/tir/transforms/make_unpacked_api.cc +++ b/src/tir/transforms/make_unpacked_api.cc @@ -59,16 +59,13 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { // Collect variables and buffers to map between Array args; - Map new_buffer_map; + for (const Var& param : func->params) { // Ideally all func params should have Buffers defined in the buffer_map // We should look to insert buffer_maps for all PrimFuncs that are returned // to the core compiler. if (func->buffer_map.find(param) != func->buffer_map.end()) { args.push_back(func->buffer_map[param]->data); - // Rewiring the buffer_var to map to Buffers for low-level passes - // retain information about the buffer. - new_buffer_map.Set(func->buffer_map[param]->data, func->buffer_map[param]); } else { args.push_back(param); } @@ -82,7 +79,7 @@ PrimFunc MakeUnpackedAPI(PrimFunc&& func) { func_ptr->body = MergeNest(device_init, func_ptr->body); func_ptr->params = args; func_ptr->ret_type = PrimType(DataType::Int(32)); - func_ptr->buffer_map = new_buffer_map; + func_ptr->buffer_map = Map(); // return the function. 
return std::move(func); From c89a8baeeb0e76eb67a38651cc8a5829195f9a6b Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Tue, 27 Sep 2022 16:11:14 -0700 Subject: [PATCH 266/704] [usmp] Also remap VarNode to USMP-allocated buffer (#12880) Before this patch, ConvertPoolAllocationsToOffsets would generate TIR like the following: let dense_let: Pointer(global int32) = @tir.address_of(global_workspace_37_buffer_var[69952], dtype=handle) for (k.outer: int32, 0, 64) { @tir.call_extern("gemm_1x1x1_update_UKVNAEBL", ..., dense, ...) } T_multiply[ax1] = @tir.q_multiply_shift(((dense: Buffer(dense_let, int32, [10], [], align=32)[ax1], ...) This caused CodegenSourceBase to later fail with this error: "src/target/source/codegen_source_base.cc", line 67 Check failed: (it != var_idmap_.end()) is false: Find undefined Variable dense After this patch, "dense" in the call_extern is changed to read "dense_let." --- src/tir/usmp/analysis/extract_buffer_info.cc | 20 ++-- .../convert_pool_allocations_to_offsets.cc | 10 ++ ...orm_convert_pool_allocations_to_offsets.py | 93 +++++++++++++++++++ 3 files changed, 114 insertions(+), 9 deletions(-) diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc index 74d428f6dddf..268058945750 100644 --- a/src/tir/usmp/analysis/extract_buffer_info.cc +++ b/src/tir/usmp/analysis/extract_buffer_info.cc @@ -429,15 +429,17 @@ void BufferInfoExtractor::VisitExpr_(const VarNode* op) { Array static GetMatchedBuffers(const PrimFunc& func) { Array buffer_vars; - for (unsigned int i = 0; i < func->params.size() - 1; i++) { - Var param = func->params[i]; - buffer_vars.push_back(func->buffer_map[param]->data); - } - Var last_param = func->params.back(); - // Checks whether last var is present in the buffer map - // because it could be the resource handle - if (func->buffer_map.find(last_param) != func->buffer_map.end()) { - buffer_vars.push_back(func->buffer_map[last_param]->data); + if (func->params.size() > 0) { 
+ for (unsigned int i = 0; i < func->params.size() - 1; i++) { + Var param = func->params[i]; + buffer_vars.push_back(func->buffer_map[param]->data); + } + Var last_param = func->params.back(); + // Checks whether last var is present in the buffer map + // because it could be the resource handle + if (func->buffer_map.find(last_param) != func->buffer_map.end()) { + buffer_vars.push_back(func->buffer_map[last_param]->data); + } } return buffer_vars; } diff --git a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc index 601e34719632..56aba654b59e 100644 --- a/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc +++ b/src/tir/usmp/transform/convert_pool_allocations_to_offsets.cc @@ -96,6 +96,7 @@ class PoolAllocationToOffsetConverter : public StmtExprMutator { private: PrimExpr VisitExpr_(const CallNode* op) override; Stmt VisitStmt_(const AllocateNode* op) override; + PrimExpr VisitExpr_(const VarNode* op) override; PrimExpr VisitExpr_(const BufferLoadNode* op) override; Stmt VisitStmt_(const BufferStoreNode* op) override; @@ -395,6 +396,15 @@ PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const BufferLoadNode* op) { return std::move(load); } +PrimExpr PoolAllocationToOffsetConverter::VisitExpr_(const VarNode* op) { + auto it = allocate_var_to_let_var_.find(GetRef(op)); + if (it != allocate_var_to_let_var_.end()) { + return (*it).second; + } + + return StmtExprMutator::VisitExpr_(op); +} + Buffer PoolAllocationToOffsetConverter::GetRemappedBuffer(Buffer original) { { auto it = original_buf_to_let_buf_.find(original); diff --git a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py index fdda400a779f..31cc6e07dec3 100644 --- a/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py +++ 
b/tests/python/unittest/test_tir_usmp_transform_convert_pool_allocations_to_offsets.py @@ -600,5 +600,98 @@ def test_resnet_subgraph(): tvm.ir.assert_structural_equal(actual_func, ref_func) +@tvm.script.ir_module +class TensorIntrinStructure: + @T.prim_func + def tensor_intrin_primfunc() -> None: + dense_data = T.allocate([10], "int32", "global") + T.evaluate( + T.call_extern( + "intrin_function", + T.tvm_access_ptr( + T.type_annotation(dtype="int32"), dense_data, 0, 1, 2, dtype="handle" + ), + dtype="int32", + ) + ) + + dense = T.buffer_decl([10], "int32", data=dense_data) + dense[0] = T.q_multiply_shift(dense[0], 1608879842, 31, -7, dtype="int32") + + @T.prim_func + def __tvm_main__(input: T.handle, output: T.handle) -> None: + T.evaluate(T.call_extern("tensor_intrin_primfunc", dtype="int32")) + + +@tvm.script.ir_module +class TensorIntrinStructurePlanned: + @T.prim_func + def tensor_intrin_primfunc(global_workspace_1_var: T.Ptr[T.uint8]) -> None: + global_workspace_1_buffer_var = T.match_buffer( + global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16 + ) + T.preflattened_buffer( + global_workspace_1_buffer_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16 + ) + dense_let = T.buffer_decl([10], "int32") + with T.let(dense_let.data, T.address_of(global_workspace_1_buffer_var[0], dtype="handle")): + T.evaluate( + T.call_extern( + "intrin_function", + T.tvm_access_ptr( + T.type_annotation(dtype="int32"), dense_let.data, 0, 1, 2, dtype="handle" + ), + dtype="int32", + ) + ) + dense_let[0] = T.q_multiply_shift(dense_let[0], 1608879842, 31, -7, dtype="int32") + + @T.prim_func + def __tvm_main__( + input: T.handle, global_workspace_1_var: T.Ptr[T.uint8], output: T.handle + ) -> None: + global_workspace_1_buffer_var = T.match_buffer( + global_workspace_1_var, [40], dtype="uint8", strides=[1], elem_offset=0, align=16 + ) + T.evaluate( + T.call_extern( + "tensor_intrin_primfunc", global_workspace_1_buffer_var.data, dtype="int32" + ) 
+ ) + + +def test_tensor_intrin(): + target = Target("c") + global_workspace_pool = WorkspacePoolInfo( + "global_workspace", + [target], + ) + + tir_mod = TensorIntrinStructure + tir_mod = _assign_targets_to_primfuncs_irmodule(tir_mod, target) + tir_mod = assign_poolinfos_to_allocates_in_irmodule(tir_mod, [global_workspace_pool]) + main_func = tir_mod["__tvm_main__"] + buffer_analysis = tvm.tir.usmp.analysis.extract_buffer_info(main_func, tir_mod) + buffer_info_map = buffer_analysis.buffer_info_stmts + + fcreate_array_bi = tvm.get_global_func("tir.usmp.CreateArrayBufferInfo") + buffer_info_arr = fcreate_array_bi(buffer_info_map) + fusmp_algo_greedy_by_size = tvm.get_global_func("tir.usmp.algo.greedy_by_size") + buffer_pool_allocations = fusmp_algo_greedy_by_size( + buffer_info_arr, buffer_analysis.memory_pressure + ) + fassign_stmt_pool_allocations = tvm.get_global_func("tir.usmp.AssignStmtPoolAllocations") + pool_allocations = fassign_stmt_pool_allocations(buffer_info_map, buffer_pool_allocations) + tir_mod_with_offsets = tvm.tir.usmp.transform.convert_pool_allocations_to_offsets( + pool_allocations, emit_tvmscript_printable=True + )(tir_mod) + + expected = TensorIntrinStructurePlanned + + for gv, ref_func in expected.functions.items(): + actual_func = tir_mod_with_offsets[gv.name_hint] + tvm.ir.assert_structural_equal(actual_func, ref_func) + + if __name__ == "__main__": pytest.main([__file__] + sys.argv[1:]) From 178f82dc481bf31961206412c22dd5519a245b49 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 27 Sep 2022 16:49:31 -0700 Subject: [PATCH 267/704] [TOPI] Implement Einsum with reduction axes (#12913) * [TOPI] Implement Einsum with reduction axes * address comments --- include/tvm/topi/einsum.h | 889 +------------------ src/relay/op/tensor/math.cc | 2 +- src/topi/einsum.cc | 353 ++++++++ src/topi/transform.cc | 4 - tests/python/topi/python/test_topi_einsum.py | 36 +- 5 files changed, 397 insertions(+), 887 deletions(-) create mode 100644 src/topi/einsum.cc 
diff --git a/include/tvm/topi/einsum.h b/include/tvm/topi/einsum.h index a0c4039909ad..5e7813f8431b 100644 --- a/include/tvm/topi/einsum.h +++ b/include/tvm/topi/einsum.h @@ -49,568 +49,6 @@ namespace topi { using namespace tvm::te; using namespace topi::detail; -/*! - * \brief Compute the stride of the given shape. - * - * \param shape for the operation. - * - * \return the stride of the shape. - */ -inline Array GetStride(const Array shape) { - size_t ndim = shape.size(); - int prod = 1; - Array stride = Array(ndim, -1); - for (int i = ndim - 1; i >= 0; i--) { - stride.Set(i, if_then_else(shape[i] > 1, prod, 0)); - prod = prod * GetConstInt(shape[i]); - } - return stride; -} - -/*! - * \brief Pad the shape with 1. - * - * \param shape the input shape to be padded - * \param odim the padding size of the objective shape. - * - * \return the padded shape. - */ -inline Array Pad(const Array shape, int odim) { - int ndim = shape.size(); - CHECK_GE(odim, ndim); - Array ret(static_cast(odim), 1); - for (int idim = 0; idim < ndim; ++idim) { - ret.Set(idim, shape[idim]); - } - return ret; -} - -/*! - * \brief Parse the subscripts for one operand into an output of 'ndim' labels. - * - * \param subscripts the subscripts for to be parsed. - * \param length subscripts[0: length] represents the current operand. - * \param ndim the ndim of current operand. - * \param iop the index of the operand. - * \param op_labels the parsing result. - * For Example: - * subscripts="abbcbc", ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]. - * subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]. - * \param label_counts Count the number the label appears. - * \param min_label Save the minimal label according to ASCII. - * \param max_label Save the maximal label according to ASCII. - * - * \return 0. 
- */ -inline int ParseOperandSubscripts(const char* subscripts, int length, int ndim, int iop, - char* op_labels, char* label_counts, int* min_label, - int* max_label) { - int i; - int idim = 0; - int ellipsis = -1; - - /* Process all labels for this operand */ - for (i = 0; i < length; ++i) { - int label = subscripts[i]; - - /* A proper label for an axis. */ - if (label > 0 && isalpha(label)) { - /* Check we don't exceed the operator dimensions. */ - CHECK(idim < ndim) << "einstein sum subscripts string contains " - << "too many subscripts for operand " << iop; - - op_labels[idim++] = label; - if (label < *min_label) { - *min_label = label; - } - if (label > *max_label) { - *max_label = label; - } - label_counts[label]++; - } else if (label == '.') { - /* The beginning of the ellipsis. */ - /* Check it's a proper ellipsis. */ - CHECK( - !(ellipsis != -1 || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.')) - << "einstein sum subscripts string contains a " - << "'.' that is not part of an ellipsis ('...') " - << "in operand " << iop; - - ellipsis = idim; - } else { - CHECK(label == ' ') << "invalid subscript '" << static_cast(label) - << "' in einstein sum " - << "subscripts string, subscripts must " - << "be letters"; - } - } - - /* No ellipsis found, labels must match dimensions exactly. */ - if (ellipsis == -1) { - CHECK(idim == ndim) << "operand has more dimensions than subscripts " - << "given in einstein sum, but no '...' ellipsis " - << "provided to broadcast the extra dimensions."; - } else if (idim < ndim) { - /* Ellipsis found, may have to add broadcast dimensions. */ - /* Move labels after ellipsis to the end. */ - for (i = 0; i < idim - ellipsis; ++i) { - op_labels[ndim - i - 1] = op_labels[idim - i - 1]; - } - /* Set all broadcast dimensions to zero. 
*/ - for (i = 0; i < ndim - idim; ++i) { - op_labels[ellipsis + i] = 0; - } - } - - /* - * Find any labels duplicated for this operand, and turn them - * into negative offsets to the axis to merge with. - * - * In C, the char type may be signed or unsigned, but with - * twos complement arithmetic the char is ok either way here, and - * later where it matters the char is cast to a signed char. - */ - for (idim = 0; idim < ndim - 1; ++idim) { - int label = op_labels[idim]; - /* If it is a proper label, find any duplicates of it. */ - if (label > 0) { - /* Search for the next matching label. */ - char* next = reinterpret_cast(memchr(op_labels + idim + 1, label, ndim - idim - 1)); - - while (next != nullptr) { - /* The offset from next to op_labels[idim] (negative). */ - *next = static_cast((op_labels + idim) - next); - /* Search for the next matching label. */ - next = reinterpret_cast(memchr(next + 1, label, op_labels + ndim - 1 - next)); - } - } - } - return 0; -} - -/*! - * \brief Parse the subscripts for the output into an output that includes 'ndim_broadcast' - * unlabeled dimensions. - * - * \param subscripts the subscripts for to be parsed. - * \param length subscripts[0: length] represents the output operand. - * \param ndim_broadcast the broadcast dimension number. - * \param label_counts Count the number the label appears. - * \param out_labels similar to the op_labels in ParseOperandSubscripts, for each - * dimension, the ASCII code of the corresponding label. zero for the broadcasting dim. - * - * \return the total number of output dimensions or -1 if there is an error. - */ -inline int ParseOutputSubscripts(const char* subscripts, int length, int ndim_broadcast, - const char* label_counts, char* out_labels) { - int i, bdim; - int ndim = 0; - int ellipsis = 0; - - /* Process all the output labels. */ - for (i = 0; i < length; ++i) { - int label = subscripts[i]; - - /* A proper label for an axis. 
*/ - if (label > 0 && isalpha(label)) { - /* Check that it doesn't occur again. */ - CHECK(memchr(subscripts + i + 1, label, length - i - 1) == nullptr) - << "einstein sum subscripts string includes " - << "output subscript '" << static_cast(label) << "' multiple times"; - - /* Check that it was used in the inputs. */ - CHECK(label_counts[label] != 0) - << "einstein sum subscripts string included " - << "output subscript '" << static_cast(label) << "' which never appeared " - << "in an input"; - - /* Check that there is room in out_labels for this label. */ - CHECK(ndim < NPY_MAXDIMS) << "einstein sum subscripts string contains " - << "too many subscripts in the output"; - - out_labels[ndim++] = label; - } else if (label == '.') { - /* The beginning of the ellipsis. */ - /* Check it is a proper ellipsis. */ - CHECK(!(ellipsis || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.')) - << "einstein sum subscripts string " - << "contains a '.' that is not part of " - << "an ellipsis ('...') in the output"; - - /* Check there is room in out_labels for broadcast dims. */ - CHECK(ndim + ndim_broadcast <= NPY_MAXDIMS) << "einstein sum subscripts string contains " - << "too many subscripts in the output"; - - ellipsis = 1; - for (bdim = 0; bdim < ndim_broadcast; ++bdim) { - out_labels[ndim++] = 0; - } - } else { - CHECK(label == ' ') << "invalid subscript '" << static_cast(label) - << "' in einstein sum " - << "subscripts string, subscripts must " - << "be letters"; - } - } - - /* If no ellipsis was found there should be no broadcast dimensions. */ - CHECK(!(!ellipsis && ndim_broadcast > 0)) << "output has more dimensions than subscripts " - << "given in einstein sum, but no '...' ellipsis " - << "provided to broadcast the extra dimensions."; - - return ndim; -} - -/*! - * \brief If any dimensions are combined, create a view that combines them. - * Shows in newshape and newstride. - * - * \param op the operand tensor. 
- * \param iop the index of the operand. - * \param labels the op_labels fot the operand. Like [97, 98, -2] for "aba". - * \param newshape The combined shape. - * \param newstride The combined stride. - * - * For example: - * "aba -> ab", shape = [2,3,2] stride = [6,2,1] - * op_labels = [97, 98, -2], newshape = [2,3], newstride = [7,2] - */ -inline void GetCombinedDimsView(const Tensor& op, int iop, char* labels, Array* newshape, - Array* newstride) { - int idim, ndim, icombine, combineoffset; - int icombinemap[NPY_MAXDIMS]; - int newdim; - - Array shape = op->shape; - Array stride = GetStride(shape); - ndim = op.ndim(); - newdim = newshape->size(); - - /* Initialize the dimensions and strides to zero */ - for (idim = 0; idim < newdim; ++idim) { - newshape->Set(idim, 0); - newstride->Set(idim, 0); - } - - /* Copy the dimensions and strides, except when collapsing */ - icombine = 0; - for (idim = 0; idim < ndim; ++idim) { - /* - * The char type may be either signed or unsigned, we - * need it to be signed here. 
- */ - int label = (signed char)labels[idim]; - /* If this label says to merge axes, get the actual label */ - if (label < 0) { - combineoffset = label; - label = labels[idim + label]; - } else { - combineoffset = 0; - if (icombine != idim) { - labels[icombine] = labels[idim]; - } - icombinemap[idim] = icombine; - } - /* If the label is 0, it's an unlabeled broadcast dimension */ - if (label == 0) { - newshape->Set(icombine, shape[idim]); - newstride->Set(icombine, stride[idim]); - } else { - /* Update the combined axis dimensions and strides */ - int i = icombinemap[idim + combineoffset]; - CHECK(!((combineoffset < 0) && - GetConstInt((*newshape)[i] != 0 && (*newshape)[i] != shape[idim]))) - << "dimensions in operand " << iop << " for collapsing index '" << label - << "' don't match (" << GetConstInt((*newshape)[i]) << " != " << shape[idim] << ")"; - newshape->Set(i, shape[idim]); - newstride->Set(i, (*newstride)[i] + stride[idim]); - } - - /* If the label didn't say to combine axes, increment dest i */ - if (combineoffset == 0) { - icombine++; - } - } -} - -/*! - * \brief Prepare the operand axes to match each stride or shape pair. - * - * \param ndim the ndim of the operand tensor. - * \param iop the index of the operand. - * \param labels the op_labels fot the operand. [97, 98, -1, 99, -3, -2] for "abbcbc". - * \param axes The matched axes to be calculated. - * \param ndim_iter the dimension of iterating. Subscripts "ab, bc -> ac" ndim_iter = 3. - * \param iter_labels output_labels with the iterating label. ['a', 'c', 'b'] for the case above. - */ -inline static int PrepareOpAxes(int ndim, int iop, char* labels, int* axes, int ndim_iter, - char* iter_labels) { - int i, label, ibroadcast; - - ibroadcast = ndim - 1; - for (i = ndim_iter - 1; i >= 0; --i) { - label = iter_labels[i]; - /* - * If it's an unlabeled broadcast dimension, choose - * the next broadcast dimension from the operand. 
- */ - if (label == 0) { - while (ibroadcast >= 0 && labels[ibroadcast] != 0) { - --ibroadcast; - } - /* - * If we used up all the operand broadcast dimensions, - * extend it with a "newaxis" - */ - if (ibroadcast < 0) { - axes[i] = -1; - } else { - /* Otherwise map to the broadcast axis */ - axes[i] = ibroadcast; - --ibroadcast; - } - } else { - /* It's a labeled dimension, find the matching one */ - char* match = reinterpret_cast(memchr(labels, label, ndim)); - /* If the op doesn't have the label, broadcast it */ - if (match == nullptr) { - axes[i] = -1; - } else { - /* Otherwise use it */ - axes[i] = match - labels; - } - } - } - return 0; -} - -/*! - * \brief Count SubString. - * \param str the object string - * \param sub the pattern string - * - * \return number of substring - */ -inline int CountSubstring(const std::string& str, const std::string& sub) { - int count = 0; - std::string::size_type pos = 0; - while ((pos = str.find(sub, pos)) != std::string::npos) { - ++count; - pos += sub.length(); - } - return count; -} - -/*! - * \brief Transfer string to. - * \param str input string. - * - * \return bitset. - */ -inline std::bitset Str2Set(const std::string& str) { - std::bitset ret; - for (const char& c : str) { - ret.set(static_cast(c)); - } - return ret; -} - -/*! - * \brief Split str according to substring. - * \param str input string. - * \param sub the split pattern string. - * - * \return vector contains the splited substring. - */ -inline std::vector Split(const std::string& str, const std::string& sub) { - std::string::size_type pos = 0; - std::string::size_type start = 0; - std::vector ret; - while ((pos = str.find(sub, start)) != std::string::npos) { - ret.push_back(str.substr(start, pos - start)); - start = pos + sub.length(); - } - ret.push_back(str.substr(start)); - return ret; -} - -/*! - * \brief Parse the input subscripts into a vector of strings. - * \param subscripts input subscripts. - * \param operands operand tensors. 
- * - * \return vector of strings, vector[0] represents the input part, vector[1] represents the output. - * if no output, the vector[1] is NULL. - * "ab, bc -> ac" => ["ab,bc", "ac"] - */ -inline std::tuple ParseEinsumInput( - std::string subscripts, const std::vector>& operands) { - const std::string einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; - std::bitset einsum_symbols_set; - for (const char& c : einsum_symbols) { - einsum_symbols_set.set(c); - } - - CHECK_NE(operands.size(), 0U) << "No input operands"; - - auto end_pos = std::remove(subscripts.begin(), subscripts.end(), ' '); - subscripts.erase(end_pos, subscripts.end()); - - // Ensure all characters are valid - for (const char& c : subscripts) { - if (c == '.' || c == ',' || c == '-' || c == '>') { - continue; - } - CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; - } - - // Check for proper "->" - if (subscripts.find('-') != std::string::npos || subscripts.find('>') != std::string::npos) { - bool invalid = (std::count(subscripts.begin(), subscripts.end(), '-') > 1 || - std::count(subscripts.begin(), subscripts.end(), '>') > 1); - CHECK(!invalid && CountSubstring(subscripts, "->") == 1) - << "Subscripts can only contain one '->'."; - } - - // Parse ellipses - if (subscripts.find('.') != std::string::npos) { - std::string used = subscripts; - used.erase( - std::remove_if(used.begin(), used.end(), - [](const char& c) { return c == '.' 
|| c == ',' || c == '-' || c == '>'; }), - used.end()); - - std::bitset used_set = Str2Set(used); - std::string ellipse_inds = ""; - for (const char& c : einsum_symbols) { - if (!used_set.test(static_cast(c))) { - ellipse_inds.append(1, c); - } - } - int longest = 0; - std::string input_tmp, output_sub; - std::vector split_subscripts; - bool out_sub; - - if (subscripts.find("->") != std::string::npos) { - std::vector tmp = Split(subscripts, "->"); - input_tmp = tmp[0]; - output_sub = tmp[1]; - split_subscripts = Split(input_tmp, ","); - out_sub = true; - } else { - split_subscripts = Split(subscripts, ","); - out_sub = false; - } - - size_t size_split_subscripts = split_subscripts.size(); - subscripts = ""; - for (size_t i = 0; i < size_split_subscripts; ++i) { - const std::string& sub = split_subscripts[i]; - if (sub.find('.') != std::string::npos) { - CHECK_EQ(std::count(sub.begin(), sub.end(), '.'), 3) << "Invalid Ellipses"; - CHECK_EQ(CountSubstring(sub, "..."), 1) << "Invalid Ellipses"; - - // Take into account numerical values - int ellipse_count = 0; - if (operands[i].size() == 0) { - ellipse_count = 0; - } else { - ellipse_count = std::max(operands[i].size(), static_cast(1)); - ellipse_count -= sub.length() - 3; - } - - if (ellipse_count > longest) { - longest = ellipse_count; - } - - CHECK_GE(ellipse_count, 0) << "Ellipses lengths do not match."; - if (ellipse_count == 0) { - split_subscripts[i].erase(sub.find("..."), 3); - } else { - std::string rep_inds = ellipse_inds.substr(ellipse_inds.length() - ellipse_count); - split_subscripts[i].replace(sub.find("..."), 3, rep_inds); - } - } - subscripts += split_subscripts[i]; - if (i + 1 < size_split_subscripts) { - subscripts += ","; - } - } - std::string out_ellipse; - if (longest == 0) { - out_ellipse = ""; - } else { - out_ellipse = ellipse_inds.substr(ellipse_inds.length() - longest); - } - - if (out_sub) { - output_sub.replace(output_sub.find("..."), 3, out_ellipse); - subscripts += "->" + output_sub; - } 
else { - // Special care for outputless ellipses - std::bitset out_ellipse_set = Str2Set(out_ellipse); - std::string tmp_subscripts = subscripts, output_subscript = ""; - size_t len_tmp_subscripts = tmp_subscripts.length(); - std::sort(tmp_subscripts.begin(), tmp_subscripts.end()); - for (size_t i = 0; i < len_tmp_subscripts; ++i) { - const char& c = tmp_subscripts[i]; - if (c == ',') { - continue; - } - CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; - if ((i == 0 || tmp_subscripts[i - 1] != c) && - (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c) && - !out_ellipse_set.test(c)) { - output_subscript.append(1, c); - } - } - subscripts += "->" + out_ellipse + output_subscript; - } - } - - // Build output string if does not exist - std::tuple ret; - if (subscripts.find("->") != std::string::npos) { - std::vector tmp(2); - tmp = Split(subscripts, "->"); - ret = std::make_tuple(tmp[0], tmp[1]); - } else { - std::string first = subscripts; - std::string second = ""; - // Build output subscripts - std::string tmp_subscripts = subscripts; - size_t len_tmp_subscripts = tmp_subscripts.length(); - std::sort(tmp_subscripts.begin(), tmp_subscripts.end()); - for (size_t i = 0; i < len_tmp_subscripts; ++i) { - const char& c = tmp_subscripts[i]; - if (c == ',') { - continue; - } - CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; - if ((i == 0 || tmp_subscripts[i - 1] != c) && - (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c)) { - second.append(1, c); - } - } - ret = std::make_tuple(first, second); - } - - // Make sure output subscripts are in the input - std::bitset input_subscripts_set = Str2Set(std::get<0>(ret)); - for (const char& c : std::get<1>(ret)) { - CHECK(input_subscripts_set.test(c)) - << "Output character " << c << " did not appear in the input"; - } - - // Make sure number operands is equivalent to the number of terms - CHECK_EQ(std::count(std::get<0>(ret).begin(), 
std::get<0>(ret).end(), ',') + 1, operands.size()) - << "Number of einsum subscripts must be equal to the " - << "number of operands."; - - return ret; -} - /*! * \brief Compute the shape of the output. * \param subscripts input subscripts. @@ -618,54 +56,8 @@ inline std::tuple ParseEinsumInput( * * \return the shape of the output. */ -inline Array NumpyEinsumShape(const std::string subscripts, - const std::vector>& operands) { - // Parsing - std::tuple parsed_subscripts = ParseEinsumInput(subscripts, operands); - - // Build a few useful list and sets - std::vector input_list = Split(std::get<0>(parsed_subscripts), ","); - size_t isize = input_list.size(); - - // Get length of each unique dimension and ensure all dimensions are correct - int dimension_dict[LABELRANGE]; - memset(dimension_dict, -1, sizeof(dimension_dict)); - for (size_t i = 0; i < isize; ++i) { - const std::string& term = input_list[i]; - const Array& sh = operands[i]; - CHECK_EQ(sh.size(), term.length()) - << "Einstein sum subscript " << input_list[i] << " does not contain the " - << "correct number of indices for operand " << i << "."; - size_t len_term = term.length(); - for (size_t j = 0; j < len_term; ++j) { - int64_t dim = GetConstInt(sh[j]); - const char& c = term[j]; - - if (dimension_dict[static_cast(c)] != -1) { - // For broadcasting cases we always want the largest dim size - if (dimension_dict[static_cast(c)] == 1) { - dimension_dict[static_cast(c)] = dim; - } - CHECK(dim == 1 || dim == dimension_dict[static_cast(c)]) - << "Size of label '" << c << "' for operand " << i << " (" - << dimension_dict[static_cast(c)] << ") does not match previous terms (" << dim - << ")."; - } else { - dimension_dict[static_cast(c)] = dim; - } - } - } - - // Get oshape - const std::string& output_str = std::get<1>(parsed_subscripts); - size_t odim = output_str.size(); - Array oshape(odim, -1); - for (size_t i = 0; i < odim; ++i) { - oshape.Set(i, dimension_dict[static_cast(output_str[i])]); - } - // 
Neglecting oshape assign check temporally - return oshape; -} +Array InferEinsumShape(const std::string& subscripts, + const std::vector>& operands); /*! * \brief Evaluates the Einstein summation convention on the operands. @@ -678,265 +70,26 @@ inline Array NumpyEinsumShape(const std::string subscripts, * * \return The calculation based on the Einstein summation convention. */ -inline Tensor einsum(const std::string& subscripts_str, const Array inputs, - std::string name = "T_einsum", std::string tag = kEinsum) { - bool back = false; - const char* subscripts = subscripts_str.data(); - const char* head = subscripts; - const int nop = inputs.size(); - - /* Step 1: Parse the subscripts string into label_counts and op_labels */ - int iop, idim, min_label = LABELRANGE - 1, max_label = 0; - char label_counts[LABELRANGE], op_labels[NPY_MAXARGS][NPY_MAXDIMS]; - memset(label_counts, 0, sizeof(label_counts)); - for (iop = 0; iop < nop; ++iop) { - int length = static_cast(strcspn(subscripts, ",-")); - - CHECK(!(iop == nop - 1 && subscripts[length] == ',')) - << "more operands provided to einstein sum function " - << "than specified in the subscripts string"; - CHECK(!(iop < nop - 1 && subscripts[length] != ',')) - << "fewer operands provided to einstein sum function " - << "than specified in the subscripts string"; - CHECK_EQ(ParseOperandSubscripts(subscripts, length, inputs[iop + back].ndim(), iop, - op_labels[iop], label_counts, &min_label, &max_label), - 0); - - /* Move subscripts to the start of the labels for the next op */ - subscripts += length; - - if (iop < nop - 1) { - CHECK_LT(subscripts - head, subscripts_str.length()) << "subscripts out of range"; - subscripts++; - } - } - /* - * Find the number of broadcast dimensions, which is the maximum - * number of labels == 0 in an op_labels array. +Tensor einsum(const std::string& subscripts_str, const Array inputs, + std::string name = "T_einsum", std::string tag = kEinsum); + +struct EinsumEquation { + /*! 
+ * \brief Create EinsumEquation from a string. + * The result will be converted to the explicit mode of Einsum if it is in implicit mode. + * \return The created EinsumEquation. */ - int ndim_broadcast = 0; - for (iop = 0; iop < nop; ++iop) { - int count_zeros = 0; - int ndim; - char* labels = op_labels[iop]; - - ndim = inputs[iop + back].ndim(); - for (idim = 0; idim < ndim; ++idim) { - if (labels[idim] == 0) { - ++count_zeros; - } - } - - if (count_zeros > ndim_broadcast) { - ndim_broadcast = count_zeros; - } - } - - /* - * If there is no output signature, fill output_labels and ndim_output - * using each label that appeared once, in alphabetical order. - */ - int label, ndim_output; - char output_labels[NPY_MAXDIMS]; - if (subscripts[0] == '\0') { - /* If no output was specified, always broadcast left, as usual. */ - for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) { - output_labels[ndim_output] = 0; - } - for (label = min_label; label <= max_label; ++label) { - if (label_counts[label] == 1) { - CHECK(ndim_output < NPY_MAXDIMS) << "einstein sum subscript string has too many " - << "distinct labels"; - output_labels[ndim_output++] = label; - } - } - } else { - CHECK(subscripts[0] == '-' && subscripts[1] == '>') << "einstein sum subscript string does not " - << "contain proper '->' output specified"; - subscripts += 2; - - /* Parse the output subscript string. */ - ndim_output = ParseOutputSubscripts(subscripts, strlen(subscripts), ndim_broadcast, - label_counts, output_labels); - CHECK_GE(ndim_output, 0); - } - - /* - * Step 2: - * Process all the input ops, combining dimensions into their - * diagonal where specified. - */ - std::vector> opshape(nop), opstride_true(nop); - for (iop = 0; iop < nop; ++iop) { - char* labels = op_labels[iop]; - int combine, ndim; - - ndim = inputs[iop + back].ndim(); - - /* - * Check whether any dimensions need to be combined - * - * The char type may be either signed or unsigned, we - * need it to be signed here. 
- */ - combine = 0; - for (idim = 0; idim < ndim; ++idim) { - if ((signed char)labels[idim] < 0) { - combine++; - } - } - /* If any dimensions are combined, create a view which combines them */ - if (combine) { - Array tshape(static_cast(ndim - combine), -1); - Array tstride(static_cast(ndim - combine), -1); - GetCombinedDimsView(inputs[iop + back], iop, labels, &tshape, &tstride); - opshape[iop] = tshape; - opstride_true[iop] = tstride; - } else { - /* No combining needed */ - opshape[iop] = inputs[iop + back]->shape; - opstride_true[iop] = GetStride(opshape[iop]); - } - } - /* - * Step 3: - * Set up the labels for the iterator (output + combined labels). - * Can just share the output_labels memory, because iter_labels - * is output_labels with some more labels appended. - */ - char* iter_labels = output_labels; - int ndim_iter = ndim_output; - for (label = min_label; label <= max_label; ++label) { - if (label_counts[label] > 0 && memchr(output_labels, label, ndim_output) == nullptr) { - CHECK(ndim_iter < NPY_MAXDIMS) << "too many subscripts in einsum"; - iter_labels[ndim_iter++] = label; - } - } - /* Step 4: Set up the op_axes for the iterator */ - Array itershape(static_cast(ndim_iter), -1); - std::vector> iterstride(nop + 1, - Array(static_cast(ndim_iter), 0)); - - // output_shape - std::vector> operands; - for (size_t i = 0; i < inputs.size(); i++) { - operands.push_back(inputs[i]->shape); - } - Array oshape = NumpyEinsumShape(subscripts_str, operands); - Array ostride_true = GetStride(oshape); - Array reduceshape; - std::vector> remainshape(nop); - int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS]; - int* op_axes[NPY_MAXARGS]; - for (iop = 0; iop < nop; ++iop) { - op_axes[iop] = op_axes_arrays[iop]; - CHECK_GE(PrepareOpAxes(opshape[iop].size(), iop, op_labels[iop], op_axes[iop], ndim_iter, - iter_labels), - 0); - for (idim = 0; idim < ndim_iter; idim++) { - if (op_axes[iop][idim] != -1) { - iterstride[iop].Set(idim, opstride_true[iop][op_axes[iop][idim]]); - if 
(GetConstInt(itershape[idim]) != -1) { - if (GetConstInt(itershape[idim]) == 1) { - itershape.Set(idim, opshape[iop][op_axes[iop][idim]]); - } - } else { - itershape.Set(idim, opshape[iop][op_axes[iop][idim]]); - } - } - } - } - for (idim = 0; idim < ndim_output; ++idim) { - iterstride[nop].Set(idim, ostride_true[idim]); - } - reduceshape = Array(static_cast(ndim_iter - ndim_output), 0); - for (idim = ndim_output; idim < ndim_iter; ++idim) { - reduceshape.Set(idim - ndim_output, itershape[idim]); - } - for (iop = 0; iop < nop; iop++) { - Array rsh; - for (idim = 0; idim < ndim_iter; idim++) { - if (op_axes_arrays[iop][idim] == -1) { - rsh.push_back(GetConstInt(itershape[idim])); - } else { - if (GetConstInt(itershape[idim] != opshape[iop][op_axes_arrays[iop][idim]])) { - rsh.push_back(GetConstInt(itershape[idim])); - } - } - } - remainshape[iop] = Array(rsh.begin(), rsh.end()); - } - // exclude the 0-dim case - if (ndim_iter == 0) { - ndim_iter = 1; - } - itershape = Pad(itershape, ndim_iter); - for (iop = 0; iop <= nop; ++iop) { - iterstride[iop] = Pad(iterstride[iop], ndim_iter); - } - // oshape = Pad(oshape, ndim_iter); - reduceshape = Pad(reduceshape, ndim_iter); - for (iop = 0; iop < nop; ++iop) { - opshape[iop] = Pad(opshape[iop], ndim_iter); - remainshape[iop] = Pad(remainshape[iop], ndim_iter); - } - // ostride and rstride - Array> ostride; - Array> rstride; - - for (iop = 0; iop < nop; ++iop) { - Array otmp(static_cast(ndim_iter), 0); - Array rtmp(static_cast(ndim_iter), 0); - for (idim = 0; idim < ndim_iter; ++idim) { - otmp.Set(idim, idim < ndim_output ? iterstride[iop][idim] : 1); - rtmp.Set(idim, idim < ndim_iter - ndim_output ? 
iterstride[iop][idim + ndim_output] : 1); - } - ostride.push_back(otmp); - rstride.push_back(rtmp); - } - - // func: input indices => return cooresponding value - auto func = [inputs, oshape, ostride, reduceshape, ndim_iter, rstride, - nop](const Array& input_indices) -> PrimExpr { - for (int rdim = 0; rdim < ndim_iter; ++rdim) { - if (GetConstInt(reduceshape[rdim]) == 0) { - return 0; // - } - } - Array ridx = UnravelIndex(0, reduceshape); - - PrimExpr sum = 0; - bool rec_flag = false; - do { - PrimExpr tmp = 1; - for (int iop = 0; iop < nop; ++iop) { - if (iop != -1) { - PrimExpr k = 0; - - for (size_t i = 0; i < input_indices.size(); ++i) { - k += input_indices[i] * ostride[iop][i]; - } - for (size_t i = 0; i < ridx.size(); ++i) { - k += ridx[i] * rstride[iop][i]; - } - Array temp_indices = UnravelIndex(k, inputs[iop]->shape); - tmp = tmp * inputs[iop](temp_indices); - } - } - sum += tmp; - ridx.Set(ridx.size() - 1, ridx[ridx.size() - 1] + 1); - for (int i = static_cast(ridx.size() - 1); - (i > 0) && GetConstInt(ridx[i] >= reduceshape[i]); --i) { - ridx.Set(i, ridx[i] - reduceshape[i]); - ridx.Set(i - 1, ridx[i - 1] + 1); - } - rec_flag = GetConstInt(ridx[0] < reduceshape[0]); - } while (rec_flag); - return sum; - }; - - return compute(oshape, func, name, tag); -} + static EinsumEquation FromString(const std::string& equation); + using Label = char; + using Subscript = std::vector