diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index f2c294cfed1a5..db216e43e2d1a 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.RemoveNoOp()(mod) mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod) mod = ethosu_passes.HoistAllocates()(mod) + mod = ethosu_passes.CopyComputeReordering()(mod) disable_storage_rewrite = curr_cfg.get("tir.disable_storage_rewrite", False) if not disable_storage_rewrite: mod = tvm.tir.transform.StorageRewrite()(mod) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py index baadede08d668..e05eed9c4bd02 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument, no-else-return, inconsistent-return-statements, too-many-nested-blocks """The TIR passes to be run on Arm(R) Ethos(TM)-U NPU TIR Compiler.""" from collections import namedtuple +from typing import Optional import numpy as np # type: ignore import tvm @@ -913,3 +914,26 @@ def HoistAllocates() -> tvm.IRModule: The new module with hoisted allocate nodes. """ return _ffi_api.HoistAllocates() + + +def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRModule: + """ + Reorders copy and compute nodes in such a way that independent DMA copies, + and computes happen in parallel. + Copies to buffers with local scope are not reordered, indeed they copy LUT + into the SHRAM which already happens in parallel with copying weights into + the weights encoder. + + Parameters + ---------- + max_copy_movements: Optional[int] + The maximum number of movements allowed for a copy. + If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements + is used if provided, otherwise the default value will be 1. + + Returns + ------- + tvm.IRModule + The new module with copy and compute nodes reordered. + """ + return _ffi_api.CopyComputeReordering(max_copy_movements) diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc index 45161499f5be4..2b7b2b4741e67 100644 --- a/src/tir/contrib/ethosu/passes.cc +++ b/src/tir/contrib/ethosu/passes.cc @@ -27,7 +27,17 @@ #include #include +#include + namespace tvm { + +/*! + * \brief The maximum number of movements allowed for a copy in the CopyComputeReordering pass. + */ +constexpr const char* kCopyComputeReorderingMaxCopyMovements = + "tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements"; +TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingMaxCopyMovements, Integer); + namespace tir { namespace contrib { namespace ethosu { @@ -110,6 +120,104 @@ tvm::transform::Pass HoistAllocates() { TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates); +/*! + * \brief Reorders copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * Copies to buffers with local scope are not reordered, indeed they copy LUT + * into the SHRAM which already happens in parallel with copying weights into + * the weights encoder. + */ +class CopyComputeReorderingMutator : public StmtExprMutator { + public: + explicit CopyComputeReorderingMutator(int max_copy_movements) + : _max_copy_movements{max_copy_movements} {} + + PrimFunc operator()(PrimFunc main_func) { + if (_max_copy_movements > 0) { + auto prim_func_node{main_func.CopyOnWrite()}; + prim_func_node->body = this->VisitStmt(main_func->body); + return GetRef(prim_func_node); + } + return main_func; + } + + private: + Stmt VisitStmt_(const SeqStmtNode* op) override { + if (op->size() <= 1) { + return StmtExprMutator::VisitStmt_(op); + } + + auto seq_stmt{GetRef(op)}; + std::vector new_seq(seq_stmt->size()); + std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin()); + + // Each copy statement to a buffer with global scope is moved up + // at most `_max_copy_movements` times. + for (size_t index = 0; index < new_seq.size(); ++index) { + if (stmt_is_global_copy(new_seq[index])) { + int lower = std::max(0, static_cast(index) - _max_copy_movements); + for (int i = index; i > lower && !stmt_is_copy(new_seq[i - 1]); --i) { + std::swap(new_seq[i - 1], new_seq[i]); + } + } + } + + auto seq_stmt_node{CopyOnWrite(op)}; + seq_stmt_node->seq = std::move(new_seq); + return Stmt{seq_stmt_node}; + } + + tvm::runtime::Array get_stmt_args(const Stmt& stmt) { + auto eval_node{stmt.as()}; + ICHECK(eval_node) << "Expected statement to be an evaluate node, but was " + << stmt->GetTypeKey(); + auto call_node{eval_node->value.as()}; + ICHECK(call_node) << "Expected expression to be a call node, but was " + << eval_node->value->GetTypeKey(); + return call_node->args; + } + + bool stmt_is_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy"; + } + + bool stmt_is_global_copy(const Stmt& stmt) { + auto args{get_stmt_args(stmt)}; + return args[0].as()->value == "ethosu_copy" && + args[3].as()->buffer.scope() == "global"; + } + + /*! The maximum number of movements allowed for a copy. */ + int _max_copy_movements; +}; + +/*! + * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies, + * and computes happen in parallel. + * + * \param max_copy_movements: The maximum number of movements allowed for a copy. + * If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements + * is used if provided, otherwise the default value will be 1. + * \return tvm::transform::Pass + */ +tvm::transform::Pass CopyComputeReordering(Optional max_copy_movements) { + auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) { + ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main")) + << "Expected a single primitive function called 'main'. Please run the " + "CopyComputeReordering " + "pass in conjunction with the LowerToTIR() pass."; + auto value = max_copy_movements.value_or( + ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value()); + return CopyComputeReorderingMutator(value)(f); + }; + return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0, + "tir.contrib.ethos-u.CopyComputeReordering", {}); +} + +TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.CopyComputeReordering") + .set_body_typed(CopyComputeReordering); + } // namespace ethosu } // namespace contrib } // namespace tir diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py index 5e4117e50f8e1..01545217beb48 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py +++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py @@ -91,10 +91,10 @@ def _get_ethosu_workspace_size( @pytest.mark.parametrize( "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping", [ - ("ethos-u55-256", 1067408, 14096), - ("ethos-u55-128", 1067408, 3968), - ("ethos-u55-64", 1067408, 3968), - ("ethos-u55-32", 1067392, 3952), + ("ethos-u55-256", 1067520, 14208), + ("ethos-u55-128", 1067520, 4080), + ("ethos-u55-64", 1067520, 4080), + ("ethos-u55-32", 1067504, 4064), ], ) def test_double_conv2d( @@ -161,10 +161,10 @@ def tf_graph(x): @pytest.mark.parametrize( "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping", [ - ("ethos-u55-256", 180096, 15008), - ("ethos-u55-128", 180096, 14240), - ("ethos-u55-64", 180096, 14240), - ("ethos-u55-32", 180096, 14240), + ("ethos-u55-256", 180288, 15200), + ("ethos-u55-128", 180288, 14432), + ("ethos-u55-64", 180288, 14432), + ("ethos-u55-32", 180272, 14416), ], ) def test_depthwise2d_conv2d_pooling( diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py new file mode 100644 index 0000000000000..eebaa3b816b42 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py @@ -0,0 +1,472 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +pytest.importorskip("ethosu.vela") + +import tvm +from tvm.script import tir as T +from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering + +# fmt: off +@tvm.script.ir_module +class AllOperatorsWithWeights: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_all_operators_with_weights_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(AllOperatorsWithWeights) + reference_mod = AllOperatorsWithWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_all_operators_with_weights_max_copy_movements_1(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(1)(AllOperatorsWithWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_all_operators_with_weights_max_copy_movements_2(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([8192], "int8") + buffer2 = T.buffer_decl([128], "uint8") + buffer3 = T.buffer_decl([32], "uint8") + buffer4 = T.buffer_decl([112], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + buffer6 = T.buffer_decl([112], "uint8") + buffer7 = T.buffer_decl([32], "uint8") + buffer8 = T.buffer_decl([112], "uint8") + buffer9 = T.buffer_decl([32], "uint8") + buffer10 = T.buffer_decl([2048], "int8") + # body + p1 = T.allocate([128], "uint8", "global") + p2 = T.allocate([112], "uint8", "global") + p3 = T.allocate([112], "uint8", "global") + p4 = T.allocate([32], "uint8", "global") + p5 = T.allocate([32], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + p7 = T.allocate([112], "uint8", "global") + p8 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(2)(AllOperatorsWithWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off +@tvm.script.ir_module +class AllOperatorsWithoutWeights: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([36], "int8") + buffer2 = T.buffer_decl([9], "int8") + # body + p1 = T.allocate([96], "int8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 4, 3, 3, 0, 4, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 12, 3, 1, "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 3, 2, 3, 3, 0, 2, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 32, 16, 1, "int8", 3, 1, 3, 3, 0, 1, buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 3, 1, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +@pytest.mark.parametrize("max_copy_movements", [0, 1, 2]) +def test_all_operators_without_weights(max_copy_movements): + test_mod = CopyComputeReordering(max_copy_movements)(AllOperatorsWithoutWeights) + reference_mod = AllOperatorsWithoutWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off +@tvm.script.ir_module +class OperatorsWithAndWithoutWeights: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_operators_with_and_without_weights_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(OperatorsWithAndWithoutWeights) + reference_mod = OperatorsWithAndWithoutWeights + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_operators_with_and_without_weights_max_copy_movements_1(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(1)(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_operators_with_and_without_weights_max_copy_movements_2(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(2)(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +# fmt: off +@tvm.script.ir_module +class CopyToBufferWithLocalScope: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) +# fmt: on + + +def test_copy_to_buffer_with_local_scope_max_copy_movements_0(): + test_mod = CopyComputeReordering(0)(CopyToBufferWithLocalScope) + reference_mod = CopyToBufferWithLocalScope + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +@pytest.mark.parametrize("max_copy_movements", [1, 2]) +def test_copy_to_buffer_with_local_scope_max_copy_movements_n(max_copy_movements): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([64], "uint8") + buffer2 = T.buffer_decl([48], "uint8") + buffer3 = T.buffer_decl([48], "uint8") + buffer4 = T.buffer_decl([256], "uint8") + buffer5 = T.buffer_decl([16], "uint8") + buffer6 = T.buffer_decl([48], "uint8") + buffer7 = T.buffer_decl([256], "uint8") + buffer8 = T.buffer_decl([64], "uint8") + # body + p1 = T.allocate([48], "uint8", "global") + p2 = T.allocate([48], "uint8", "global") + p3 = T.allocate([256], "int8", "local") + p4 = T.allocate([256], "int8", "global") + p5 = T.allocate([16], "uint8", "global") + p6 = T.allocate([48], "uint8", "global") + p7 = T.allocate([256], "int8", "local") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering(max_copy_movements)(CopyToBufferWithLocalScope) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_multiple_prim_funcs(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def main(): + T.evaluate(0) + + @T.prim_func + def abc(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering(1)(InputModule) + + +def test_no_main_prim_func(): + # fmt: off + @tvm.script.ir_module + class InputModule: + @T.prim_func + def abs(): + T.evaluate(0) + # fmt: on + + err_rgx = ( + r"Expected a single primitive function called 'main'. " + r"Please run the CopyComputeReordering pass in conjunction with the LowerToTIR\(\) pass." + ) + with pytest.raises(tvm.TVMError, match=err_rgx): + CopyComputeReordering(1)(InputModule) + + +def test_default_max_copy_movements(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +def test_pass_context_option_max_copy_movements(): + # fmt: off + @tvm.script.ir_module + class ReferenceModule: + @T.prim_func + def main() -> None: + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + buffer1 = T.buffer_decl([97156], "int8") + buffer2 = T.buffer_decl([80], "uint8") + buffer3 = T.buffer_decl([64], "uint8") + buffer4 = T.buffer_decl([96], "uint8") + buffer5 = T.buffer_decl([32], "uint8") + # body + p1 = T.allocate([390336], "int8", "global") + p2 = T.allocate([80], "uint8", "global") + p3 = T.allocate([64], "uint8", "global") + p4 = T.allocate([390336], "int8", "global") + p5 = T.allocate([96], "uint8", "global") + p6 = T.allocate([32], "uint8", "global") + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 64, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 96, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p6[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_pooling", "int8", 214, 227, 2, 214, 0, 227, buffer1[0], 0, 0, 0, T.float32(1), 0, "NHWC", 454, 2, 1, "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1824, 16, 1, "MAX", 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 2, 214, 0, 114, p1[0], 0, 0, 0, T.float32(0.00392157), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, 3, 1, 1, 1, 1, 2, p2[0], 80, 0, p3[0], 64, 0, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 114, 5, 214, 0, 114, p4[0], 0, 0, 0, T.float32(0.0174839), -128, "NHCWB16", 1824, 16, 1, "int8", 214, 114, 3, 214, 0, 114, buffer3[0], 0, 0, 0, T.float32(0.104816), -128, "NHWC", 342, 3, 1, 3, 1, 1, 1, 1, 2, p5[0], 96, 0, p6[0], 32, 0, 1, 0, 1, "CLIP", -128, 127, "TFL", "NONE", 0, 0, 0, dtype="handle")) + # fmt: on + + with tvm.transform.PassContext( + config={"tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements": 2} + ): + test_mod = CopyComputeReordering()(OperatorsWithAndWithoutWeights) + reference_mod = ReferenceModule + tvm.ir.assert_structural_equal(test_mod, reference_mod, True) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_encode_constants.py b/tests/python/contrib/test_ethosu/test_encode_constants.py index 92e6cd3e19cb8..15b719f33c3f9 100644 --- a/tests/python/contrib/test_ethosu/test_encode_constants.py +++ b/tests/python/contrib/test_ethosu/test_encode_constants.py @@ -37,33 +37,34 @@ class WeightStreamOnlyU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([128], "uint8") - buffer_1 = T.buffer_decl([32], "uint8") - buffer_2 = T.buffer_decl([112], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([112], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([112], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([128], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([112], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([112], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([112], "uint8") + buffer8 = T.buffer_decl([32], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - p1_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) - p2_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - p1_global_1 = T.buffer_decl([112], dtype="uint8", data=p1_global.data) - p2_global_1 = T.buffer_decl([32], dtype="uint8", data=p2_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, p1_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, p2_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global[0], 128, T.int8(-1), T.int8(-1), 12, p2_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 112, p1_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, p2_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, T.int8(-1), T.int8(-1), 12, p2_global_1[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + buffer9 = T.buffer_decl([112], "uint8", data=p1.data) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 128, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 112, buffer9[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 112, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 112, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, T.int8(-1), T.int8(-1), 12, p4[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -74,37 +75,34 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - buffer_encoded = T.buffer_decl([160], dtype="uint8") - buffer_encoded_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_2 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_3 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4 = T.buffer_decl([176], dtype="uint8") - buffer_encoded_5 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_6 = T.buffer_decl([160], dtype="uint8") - buffer_encoded_7 = T.buffer_decl([32], dtype="uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) + buffer_encoded_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_1_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_2_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_4_1 = T.buffer_decl([176], dtype="uint8") + buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_6_1 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") # body placeholder_global = T.allocate([176], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) - placeholder_global_2 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) - placeholder_global_3 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 160, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) + placeholder_global_2 = T.allocate([160], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global_1 = T.buffer_decl([160], dtype="uint8", data=placeholder_global.data) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 160, placeholder_global_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 160, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 80, placeholder_global_1[80], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 160, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 176, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 160, placeholder_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 80, placeholder_global_3[80], 80, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 176, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 160, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 96, placeholder_global[96], 80, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 80, placeholder_global_2[80], 80, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -172,19 +170,21 @@ class RereadWeightsU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([304], "uint8") - buffer_1 = T.buffer_decl([80], "uint8") + buffer1 = T.buffer_decl([304], "uint8") + buffer2 = T.buffer_decl([80], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer[0], 304, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 80, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 304, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 304, T.int8(-1), T.int8(-1), 12, p2[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 304, T.int8(-1), T.int8(-1), 12, p4[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -195,20 +195,20 @@ def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) # buffer definition - placeholder_encoded = T.buffer_decl([368], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([96], dtype="uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) + placeholder_encoded_1 = T.buffer_decl([368], "uint8") + placeholder_encoded_1_2 = T.buffer_decl([96], "uint8") # body placeholder_global = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([368], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global[0], dtype="handle")) + placeholder_global_1 = T.allocate([368], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_1 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 368, placeholder_global_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1_2[0], 96, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 192, placeholder_global[192], 176, 12, placeholder_d_global[0], 48, placeholder_d_global[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 368, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 96, placeholder_d_global_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[64], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 192, placeholder_global_1[192], 176, 12, placeholder_d_global_1[0], 48, placeholder_d_global_1[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -374,35 +374,37 @@ class MixedReadU55: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - buffer = T.buffer_decl([592], "uint8") - buffer_1 = T.buffer_decl([160], "uint8") - buffer_2 = T.buffer_decl([80], "uint8") - buffer_3 = T.buffer_decl([32], "uint8") - buffer_4 = T.buffer_decl([80], "uint8") - buffer_5 = T.buffer_decl([32], "uint8") - buffer_6 = T.buffer_decl([80], "uint8") - buffer_7 = T.buffer_decl([32], "uint8") - buffer_8 = T.buffer_decl([80], "uint8") - buffer_9 = T.buffer_decl([32], "uint8") + buffer1 = T.buffer_decl([80], "uint8") + buffer2 = T.buffer_decl([32], "uint8") + buffer3 = T.buffer_decl([80], "uint8") + buffer4 = T.buffer_decl([32], "uint8") + buffer5 = T.buffer_decl([80], "uint8") + buffer6 = T.buffer_decl([32], "uint8") + buffer7 = T.buffer_decl([80], "uint8") + buffer8 = T.buffer_decl([32], "uint8") + buffer9 = T.buffer_decl([592], "uint8") + buffer10 = T.buffer_decl([160], "uint8") T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) # body - ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_8[0], 80, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_9[0], 32, placeholder_d_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 80, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + p1 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer9[0], 592, T.int8(-1), T.int8(-1), 12, buffer10[0], 160, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 80, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 32, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 80, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 32, p5[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 80, T.int8(-1), T.int8(-1), 12, p2[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, p3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p4[0], 80, T.int8(-1), T.int8(-1), 12, p5[0], 32, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None @@ -412,42 +414,37 @@ class MixedReadU65: def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None: # function attr dict T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data) + T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data) # buffer definition - buffer_encoded = T.buffer_decl([96], dtype="uint8") - buffer_encoded_1 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_2 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_3 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_4 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_5 = T.buffer_decl([32], dtype="uint8") - buffer_encoded_6 = T.buffer_decl([96], dtype="uint8") - buffer_encoded_7 = T.buffer_decl([32], dtype="uint8") - placeholder_encoded = T.buffer_decl([608], dtype="uint8") - placeholder_encoded_1 = T.buffer_decl([160], dtype="uint8") - T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data) - T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data) - # body - ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + buffer_encoded_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_1_2 = T.buffer_decl([32], dtype="uint8") + placeholder_encoded_1 = T.buffer_decl([608], dtype="uint8") + placeholder_encoded_1_2 = T.buffer_decl([160], dtype="uint8") + buffer_encoded_2_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_3_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_4_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_5_1 = T.buffer_decl([32], dtype="uint8") + buffer_encoded_6_1 = T.buffer_decl([96], dtype="uint8") + buffer_encoded_7_1 = T.buffer_decl([32], dtype="uint8") placeholder_global = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_global_1 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) - placeholder_global_2 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) - placeholder_global_3 = T.buffer_decl([96], dtype="uint8", data=placeholder_global.data) placeholder_d_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) - placeholder_d_global_1 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_2 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - placeholder_d_global_3 = T.buffer_decl([32], dtype="uint8", data=placeholder_d_global.data) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded[0], 304, placeholder_encoded[304], 304, 12, placeholder_encoded_1[0], 80, placeholder_encoded_1[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded[0], 96, placeholder_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 32, placeholder_d_global[0], dtype="handle")) + ethosu_write_2 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True}) + placeholder_global_2 = T.allocate([96], "uint8", "global", annotations={"disable_lower_builtin":True}) + placeholder_d_global_2 = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1[0], 96, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_1_2[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_encoded_1[0], 304, placeholder_encoded_1[304], 304, 12, placeholder_encoded_1_2[0], 80, placeholder_encoded_1_2[80], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2_1[0], 96, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3_1[0], 32, placeholder_d_global_2[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_2[0], 96, placeholder_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_3[0], 32, placeholder_d_global_1[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_1[0], 48, placeholder_global_1[48], 48, 12, placeholder_d_global_1[0], 16, placeholder_d_global_1[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4[0], 96, placeholder_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5[0], 32, placeholder_d_global_2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6[0], 96, placeholder_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7[0], 32, placeholder_d_global_3[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_3[0], 48, placeholder_global_3[48], 48, 12, placeholder_d_global_3[0], 16, placeholder_d_global_3[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_4_1[0], 96, placeholder_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_5_1[0], 32, placeholder_d_global[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_6_1[0], 96, placeholder_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer_encoded_7_1[0], 32, placeholder_d_global_2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 48, placeholder_global[48], 48, 12, placeholder_d_global[0], 16, placeholder_d_global[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global_2[0], 48, placeholder_global_2[48], 48, 12, placeholder_d_global_2[0], 16, placeholder_d_global_2[16], 16, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py index e04cabe79d2fe..f64263ca0623b 100644 --- a/tests/python/contrib/test_ethosu/test_networks.py +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -43,13 +43,13 @@ @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1423344), - ("ethos-u65-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-256", MOBILENET_V1_URL, 1423344), - ("ethos-u55-256", MOBILENET_V2_URL, 2185584), - ("ethos-u55-128", MOBILENET_V2_URL, 2185584), - ("ethos-u55-64", MOBILENET_V2_URL, 2185584), - ("ethos-u55-32", MOBILENET_V2_URL, 2185584), + ("ethos-u65-256", MOBILENET_V1_URL, 1892704), + ("ethos-u65-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-256", MOBILENET_V1_URL, 1892704), + ("ethos-u55-256", MOBILENET_V2_URL, 2257984), + ("ethos-u55-128", MOBILENET_V2_URL, 2257984), + ("ethos-u55-64", MOBILENET_V2_URL, 2257984), + ("ethos-u55-32", MOBILENET_V2_URL, 2258000), ], ) def test_networks_without_usmp(accel_type, model_url, workspace_size): @@ -71,8 +71,8 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size): @pytest.mark.parametrize( "accel_type, model_url, workspace_size", [ - ("ethos-u65-256", MOBILENET_V1_URL, 1205872), - ("ethos-u55-256", MOBILENET_V2_URL, 1507152), + ("ethos-u65-256", MOBILENET_V1_URL, 1206880), + ("ethos-u55-256", MOBILENET_V2_URL, 1509408), ], ) def test_networks_with_usmp(accel_type, model_url, workspace_size): diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 4f06695b25b10..932df71d24029 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -88,14 +88,14 @@ def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(409 T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data) # body placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_global_unrolled_iter_1 = T.buffer_decl([272], "uint8", data=placeholder_global_unrolled_iter_0.data) placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True}) - placeholder_d_global_unrolled_iter_1 = T.buffer_decl([64], dtype="uint8", data=placeholder_d_global_unrolled_iter_0.data) + placeholder_global_unrolled_iter_1 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True}) + placeholder_d_global_unrolled_iter_1 = T.allocate([64], "uint8", "global", annotations={"disable_lower_builtin": True}) T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle")) T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_1[0], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on diff --git a/tests/python/contrib/test_ethosu/test_scheduler.py b/tests/python/contrib/test_ethosu/test_scheduler.py index 8a83e769141da..4baea26e591ef 100644 --- a/tests/python/contrib/test_ethosu/test_scheduler.py +++ b/tests/python/contrib/test_ethosu/test_scheduler.py @@ -180,29 +180,27 @@ def test_schedule_cache_reads(): @tvm.script.ir_module class DiamondGraphTir: @T.prim_func - def main(input_buffer: T.Buffer[(301056,), "int8"], output_buffer: T.Buffer[(75264,), "int8"]) -> None: + def main(placeholder: T.Buffer[(301056,), "int8"], ethosu_write: T.Buffer[(75264,), "int8"]) -> None: T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(input_buffer, [1, 56, 56, 96], dtype='int8', data=input_buffer.data) - T.preflattened_buffer(output_buffer, [1, 56, 56, 24], dtype='int8', data=output_buffer.data) - - weight_buffer = T.buffer_decl([2608], "uint8") - bias_buffer = T.buffer_decl([240], "uint8") - weight_buffer2 = T.buffer_decl([736], "uint8") - bias_buffer2 = T.buffer_decl([240], "uint8") - - weight_global = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) - weight_global2 = T.buffer_decl([736], "uint8", data=weight_global.data) - bias_global = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) - featuremap_buffer = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - featuremap_buffer2 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin": True}) - - T.evaluate(T.call_extern("ethosu_copy", weight_buffer[0], 2608, weight_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, input_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global[0], 2608, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", weight_buffer2[0], 736, weight_global2[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", bias_buffer2[0], 240, bias_global[0], dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, weight_global2[0], 736, T.int8(-1), T.int8(-1), 12, bias_global[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) - T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, featuremap_buffer2[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, output_buffer[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) + T.preflattened_buffer(placeholder, [1, 56, 56, 96], dtype='int8', data=placeholder.data) + T.preflattened_buffer(ethosu_write, [1, 56, 56, 24], dtype='int8', data=ethosu_write.data) + buffer1 = T.buffer_decl([2608], "uint8") + buffer2 = T.buffer_decl([240], "uint8") + buffer3 = T.buffer_decl([736], "uint8") + buffer4 = T.buffer_decl([240], "uint8") + p1 = T.allocate([2608], "uint8", "global", annotations={"disable_lower_builtin":True}) + p2 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p3 = T.allocate([736], "uint8", "global", annotations={"disable_lower_builtin":True}) + p4 = T.allocate([240], "uint8", "global", annotations={"disable_lower_builtin":True}) + p5 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + p6 = T.allocate([75264], "int8", "global", annotations={"disable_lower_builtin":True}) + T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 2608, p1[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 240, p2[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 736, p3[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 240, p4[0], dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 96, 56, 0, 56, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 5376, 96, 1, "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p1[0], 2608, T.int8(-1), T.int8(-1), 12, p2[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_conv2d", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 1344, 24, 1, 1, 1, 1, 1, 1, 1, p3[0], 736, T.int8(-1), T.int8(-1), 12, p4[0], 240, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle")) + T.evaluate(T.call_extern("ethosu_binary_elementwise", "int8", 56, 56, 24, 56, 0, 56, p5[0], 0, 0, 0,T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, p6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "int8", 56, 56, 24, 56, 0, 56, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 1344, 24, 1, "ADD", 0, "NONE", 0, 0, "TFL", 0, 0, 0, dtype="handle")) __tvm_meta__ = None # fmt: on @@ -218,7 +216,6 @@ def test_schedule_diamond_graph(): test_mod, _ = _lower_to_tir(func, copy_constants()) reference_mod = DiamondGraphTir - tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True)