From 92c44e2f248f98b56c00a64e9b0663da4d7a0850 Mon Sep 17 00:00:00 2001 From: Matthew Barrett Date: Tue, 31 Aug 2021 11:42:55 +0100 Subject: [PATCH 1/2] [microNPU][5] Convert Proposals to te.Schedules Change-Id: I6771578f1007b8fea02e2dec7d0c797a6ef6aa5e --- .../tvm/contrib/ethosu/cascader/__init__.py | 1 + .../tvm/contrib/ethosu/cascader/scheduler.py | 232 ++++++++++++++++++ .../test_ethosu/cascader/test_scheduler.py | 42 ++++ 3 files changed, 275 insertions(+) create mode 100644 python/tvm/contrib/ethosu/cascader/scheduler.py create mode 100644 tests/python/contrib/test_ethosu/cascader/test_scheduler.py diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py index f151fa9bc418..3ee350d008b4 100644 --- a/python/tvm/contrib/ethosu/cascader/__init__.py +++ b/python/tvm/contrib/ethosu/cascader/__init__.py @@ -36,4 +36,5 @@ from .device_config import EthosuDeviceConfig from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig from .plan import Plan +from .scheduler import apply_proposal, cascade from .cascader_options import CascaderOptions diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py new file mode 100644 index 000000000000..55b84586e3f5 --- /dev/null +++ b/python/tvm/contrib/ethosu/cascader/scheduler.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Scheduler for cascader which converts Proposals into Schedules.""" +from typing import Tuple, List, Dict, DefaultDict +from collections import defaultdict +import numpy as np + +from tvm import te +from tvm import tir +from .cascader_options import CascaderOptions +from .graph import CascaderGraph, Part, Tensor, TESubgraph +from .tensor_config import MemoryRegion +from .proposal import Proposal +from .proposal_generator import generate_proposals +from .graph import create_cascader_graph +from .device_config import EthosuDeviceConfig + + +def tile_nd( + sch: te.Schedule, tensor: te.Tensor, tile: Tuple[int, ...] +) -> Tuple[List[tir.IterVar], List[tir.IterVar]]: + """Scheduling utility to perform N-dimensional tiling. + + Parameters + ---------- + sch : te.Schedule + The schedule to apply the tiling to. + tensor : te.Tensor + The tensor to apply the tiling to. + tile : Tuple[int, ...] + The N-dimensional tile size. + + Returns + ------- + outer_indices : List[tir.IterVar] + The outer iteration variables. + inner_indices : List[tir.IterVar] + The inner iteration variables. 
+
+    """
+    outer_indices = []
+    inner_indices = []
+    for i, size in enumerate(tile):
+        outer, inner = sch[tensor].split(tensor.op.axis[i], size)
+        outer_indices.append(outer)
+        inner_indices.append(inner)
+
+    sch[tensor].reorder(*outer_indices, *inner_indices)
+    return outer_indices, inner_indices
+
+
+def stripe_part(
+    part: Part, stripe_shape: Tuple[int, ...], sch: te.Schedule
+) -> Tuple[te.Stage, tir.IterVar]:
+    """Apply a striping schedule to the TE subgraph represented by a Part."""
+    te_subgraph = part.subgraph
+    te_output_tensor = te_subgraph.output_tensor
+    outer_indices, _ = tile_nd(sch, te_output_tensor, stripe_shape)
+    g = sch.create_group(
+        outputs=te_output_tensor.op.input_tensors,
+        inputs=te_subgraph.input_tensors,
+        include_inputs=False,
+    )
+    g.compute_at(sch[te_output_tensor], outer_indices[-1])
+    for ax in outer_indices:
+        sch[te_output_tensor].unroll(ax)
+
+    return sch[te_output_tensor], outer_indices[-1]
+
+
+def cascade_part(
+    part: Part, stripe_stage: te.Stage, stripe_axis: tir.IterVar, sch: te.Schedule
+) -> None:
+    """Schedule a Part into a cascade indicated by a stripe Stage."""
+    te_subgraph = part.subgraph
+    g = sch.create_group(
+        outputs=te_subgraph.output_tensor, inputs=te_subgraph.input_tensors, include_inputs=False
+    )
+    g.compute_at(stripe_stage, stripe_axis)
+
+
+def update_readers(part: Part, readers: DefaultDict[te.Tensor, List[te.Tensor]]) -> None:
+    """Update a dictionary which stores the te.Tensors that need to be read in order to produce a given te.Tensor."""
+    visited = set()
+
+    def _visit(tensor):
+        if tensor not in visited and tensor not in part.subgraph.input_tensors:
+            visited.add(tensor)
+            for input_tensor in tensor.op.input_tensors:
+                readers[input_tensor].append(tensor)
+                _visit(input_tensor)
+
+    _visit(part.subgraph.output_tensor)
+
+
+def apply_proposal(proposal: Proposal, sch: te.Schedule) -> None:
+    """Apply a Proposal to a Schedule, converting all the Plans into TE scheduling instructions.
+
+    Note that the Schedule is mutated in-place.
+
+    Parameters
+    ----------
+    proposal : Proposal
+        The Proposal to apply to the Schedule.
+    sch : te.Schedule
+        The Schedule to apply the Proposal to.
+
+    """
+    for plan in proposal.plans:
+        output_tensor_config = plan.output_config
+        output_tensor = output_tensor_config.tensor
+        output_part = output_tensor.producers[0]
+        if output_part.in_line:
+            continue
+        stripe_config = output_tensor_config.stripe_configs[0]
+        stripe_shape = [int(x) for x in stripe_config.shape]
+        stripe_stage, stripe_axis = stripe_part(output_part, stripe_shape, sch)
+        copy_te_tensors = []
+        readers = defaultdict(list)
+        for part in plan.part_group:
+            if part != output_part:
+                cascade_part(part, stripe_stage, stripe_axis, sch)
+
+            update_readers(part, readers)
+            for i, input_tensor in enumerate(part.input_tensors):
+                tensor_config = plan.tensor_configs[input_tensor]
+                if tensor_config.home_region != tensor_config.copy_region:
+                    copy_te_tensors.append(part.subgraph.input_tensors[i])
+
+        for te_tensor in copy_te_tensors:
+            copy_stage = sch.cache_read(te_tensor, "global", readers[te_tensor])
+            sch[copy_stage].compute_at(stripe_stage, stripe_axis)
+
+
+def create_home_map(
+    graph: CascaderGraph,
+    io_region: MemoryRegion,
+    constant_region: MemoryRegion,
+    working_regions: List[MemoryRegion],
+) -> Dict[Tensor, List[MemoryRegion]]:
+    """Create a map between Tensors and the MemoryRegions they can be homed in."""
+    home_map = {}
+    for tensor in graph.tensor_order:
+        if tensor.is_constant:
+            home_map[tensor] = [constant_region]
+        elif tensor in graph.input_tensors or tensor in graph.output_tensors:
+            home_map[tensor] = [io_region]
+        else:
+            home_map[tensor] = working_regions
+
+    return home_map
+
+
+def choose_proposal(proposals: List[Proposal], cascade_region: MemoryRegion):
+    """Choose the best performing Proposal that doesn't overflow the cascade region."""
+    proposal_choice = proposals[0]
+    for proposal in reversed(proposals):
+        if proposal.memory_usage < cascade_region.size:
+            proposal_choice = proposal
+            break
+
+    return proposal_choice
+
+
+def cascade(
+    sch: te.Schedule,
+    te_graph: TESubgraph,
+    const_dict: Dict[int, np.ndarray],
+    options: CascaderOptions,
+    io_region: MemoryRegion,
+    constant_region: MemoryRegion,
+    working_regions: List[MemoryRegion],
+    device_config: EthosuDeviceConfig,
+) -> None:
+    """Schedule a Tensor Expression graph using the technique of 'cascading'.
+
+    'Cascading' is a technique whereby operations are split into smaller
+    dependent tiles ('stripes') which can then execute in an interleaved
+    fashion. This allows for operations to execute together rather than
+    sequentially which can reduce intermediate memory requirements and in
+    certain cases improve performance.
+
+    For more detail on 'cascading' as well as how it is implemented, refer to
+    the RFC here: https://github.com/apache/tvm-rfcs/pull/37.
+
+    Parameters
+    ----------
+    sch : te.Schedule
+        The Schedule to apply the cascading to.
+    te_graph : TESubgraph
+        The Tensor Expression graph from which the Schedule was created.
+    const_dict : Dict[int, np.ndarray]
+        A dictionary mapping input index to constant data if that input is
+        to be a constant.
+    options : CascaderOptions
+        Configuration options for the cascading scheduler.
+    io_region : MemoryRegion
+        The MemoryRegion in which input/output tensors should reside.
+    constant_region : MemoryRegion
+        The MemoryRegion in which constants should reside.
+    working_regions : List[MemoryRegion]
+        The MemoryRegions in which intermediate working tensors can reside. The
+        cascading scheduler will select which MemoryRegion to use per tensor.
+    device_config : EthosuDeviceConfig
+        Target device configuration.
+ + """ + assert options.cascade_region in working_regions + # First convert the Tensor Expression graph into a CascaderGraph + casc_graph = create_cascader_graph(te_graph, const_dict, device_config) + # Then create a mapping between Tensors and their possible memory homes + home_map = create_home_map(casc_graph, io_region, constant_region, working_regions) + # Generate Proposals for Pareto-optimal ways to cascade the CascaderGraph + proposals = generate_proposals(casc_graph, home_map, options) + # Select the best Proposal subject to the memory constraints + proposal_choice = choose_proposal(proposals, options.cascade_region) + # Apply the selected Proposal to the Tensor Expression Schedule + apply_proposal(proposal_choice, sch) diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py new file mode 100644 index 000000000000..311372fb1e47 --- /dev/null +++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest + +import tvm.contrib.ethosu.cascader as cs + + +def test_cascade(SRAM, FLASH, TwoConv2DWithSliceTE, TwoConv2DTE, MobileNetv1StartTE, MobileNetv1TE): + fixtures = [ + TwoConv2DTE, + TwoConv2DWithSliceTE, + MobileNetv1StartTE, + MobileNetv1TE, + ] + device_config = cs.EthosuDeviceConfig("ethos-u55-256") + for sch, te_graph, const_dict in fixtures: + options = cs.CascaderOptions( + cascade_region=SRAM, + max_proposals=64, + stripe_factors=4, + max_plan_size=10, + always_copy_size=1024, + ) + cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config) + + +if __name__ == "__main__": + pytest.main([__file__]) From ce1e16cecafb0c73abaab1201280d89f4f028939 Mon Sep 17 00:00:00 2001 From: Matthew Barrett Date: Tue, 31 Aug 2021 16:44:14 +0100 Subject: [PATCH 2/2] [microNPU][6] Migrate codegen to use cascader Use the cascader as the default scheduler. Includes placeholder MemoryRegions until integration with the PoolInfos provided by a user. Change-Id: I9a8063ed56c1c5e926389dc3b217355a5ad0e3f0 --- .../tvm/contrib/ethosu/cascader/scheduler.py | 10 ++-- .../relay/backend/contrib/ethosu/codegen.py | 48 ++++++++++++++++++- .../backend/contrib/ethosu/tir/scheduler.py | 7 +-- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py index 55b84586e3f5..79d765a1dbf7 100644 --- a/python/tvm/contrib/ethosu/cascader/scheduler.py +++ b/python/tvm/contrib/ethosu/cascader/scheduler.py @@ -42,7 +42,7 @@ def tile_nd( tensor : te.Tensor The tensor to apply the tiling to. tile : Tuple[int, ...] - The N-dimensional tile size. 
+ The N-dimensional tile size Returns ------- @@ -76,8 +76,8 @@ def stripe_part( include_inputs=False, ) g.compute_at(sch[te_output_tensor], outer_indices[-1]) - for ax in outer_indices: - sch[te_output_tensor].unroll(ax) + for axis in outer_indices: + sch[te_output_tensor].unroll(axis) return sch[te_output_tensor], outer_indices[-1] @@ -94,7 +94,9 @@ def cascade_part( def update_readers(part: Part, readers: DefaultDict[te.Tensor, List[te.Tensor]]) -> None: - """Update a dictionary which stores the te.Tensors that need to be read in order to produce a given te.Tensor.""" + """Update a dictionary which stores the te.Tensors that need to be read in order to produce a + given te.Tensor. + """ visited = set() def _visit(tensor): diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py index fe7bfc3fcdd0..b9d5db68fae7 100644 --- a/python/tvm/relay/backend/contrib/ethosu/codegen.py +++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py @@ -17,11 +17,13 @@ """Codegen for Arm(R) Ethos(TM)-U NPU""" from collections import defaultdict +from typing import List, Callable import tvm from tvm import relay from tvm import ir from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants +from tvm.contrib.ethosu.cascader import cascade, EthosuDeviceConfig, CascaderOptions, MemoryRegion from tvm.relay.backend.contrib.ethosu.legalize import LegalizeEthosU from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator from tvm.relay.backend.contrib.ethosu import util @@ -309,6 +311,47 @@ def constant_updater(expr, symbol): # pylint: disable=unused-argument return dict() +def _create_cascader( + options: CascaderOptions, + io_region: MemoryRegion, + constant_region: MemoryRegion, + working_regions: List[MemoryRegion], + device_config: EthosuDeviceConfig, +) -> Callable: + def _cascader(te_graph, const_dict, sch): + cascade( + sch, + te_graph, + const_dict, + options, + io_region, + constant_region, + working_regions, + device_config, + ) + + return _cascader + + +def _ethos_u55_cascader() -> Callable: + flash = MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4) + sram = MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16) + device_config = EthosuDeviceConfig(util.get_accelerator_config()) + cascader_options = CascaderOptions( + cascade_region=sram, + max_proposals=64, + stripe_factors=5, + max_plan_size=10, + always_copy_size=1024, + ) + return _create_cascader( + options=cascader_options, + io_region=sram, + constant_region=flash, + working_regions=[sram], + device_config=device_config, + ) + @tvm._ffi.register_func("relay.ext.ethos-u.relay_to_tir_func") def relay_to_tir_func(ext_func: relay.Function) -> tvm.tir.PrimFunc: """ @@ -336,7 +379,10 @@ def relay_to_tir_func(ext_func: relay.Function) -> tvm.tir.PrimFunc: # this should be a single intelligent and a composite scheduler # that can perform scheduling based on user inputs such as # scratch memory size. 
- tir_mod, const_dict = lower_to_tir(mod["main"], copy_constants()) + try: + tir_mod, const_dict = lower_to_tir(mod["main"], _ethos_u55_cascader()) + except AssertionError: + tir_mod, const_dict = lower_to_tir(mod["main"], copy_constants()) for param in const_dict.keys(): const_dict[param] = tvm.nd.array(const_dict[param]) diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py index dc458484ec16..3be6d37b40cd 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/scheduler.py @@ -253,9 +253,10 @@ def _detect_cache_read(stage): return False for stage in sch.stages: - if _detect_cache_read(stage): - fax = stage.fuse(*stage.op.axis) - stage.pragma(fax, "op", "ethosu_copy") + if stage.attach_type != 2: # Not inlined + if _detect_cache_read(stage): + fax = stage.fuse(*stage.op.axis) + stage.pragma(fax, "op", "ethosu_copy") def inline_no_ops(cached_func, sch):
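
Usage sketch (illustrative, not part of the diffs above). The tile_nd
utility from patch 1 is ordinary TE scheduling and can be exercised on any
compute definition; the following assumes the series is applied, and the
shapes and tile sizes are arbitrary:

    import tvm
    from tvm import te
    from tvm.contrib.ethosu.cascader.scheduler import tile_nd

    # A simple 2D elementwise computation to tile.
    a = te.placeholder((64, 64), name="A")
    b = te.compute((64, 64), lambda i, j: a[i, j] * 2.0, name="B")
    sch = te.create_schedule(b.op)

    # Split each axis by the tile size, then reorder the axes to
    # (i_outer, j_outer, i_inner, j_inner).
    outer, inner = tile_nd(sch, b, (16, 32))
    print(tvm.lower(sch, [a, b], simple_mode=True))

Driving the full cascader requires a TE graph and const_dict produced by
the NPU lowering, so that part is sketched as a hypothetical helper: the
MemoryRegion names and sizes mirror the placeholders in
_ethos_u55_cascader() and test_scheduler.py, and sch, te_graph and
const_dict stand in for the arguments that lower_to_tir passes to its
scheduler callback:

    import tvm.contrib.ethosu.cascader as cs

    def cascade_with_placeholder_regions(sch, te_graph, const_dict):
        """Run the cascader with the placeholder FLASH/SRAM regions."""
        flash = cs.MemoryRegion(name="FLASH", size=10**7, read_bandwidth=4, write_bandwidth=4)
        sram = cs.MemoryRegion(name="SRAM", size=10**6, read_bandwidth=16, write_bandwidth=16)
        options = cs.CascaderOptions(
            cascade_region=sram,
            max_proposals=64,
            stripe_factors=4,
            max_plan_size=10,
            always_copy_size=1024,
        )
        device_config = cs.EthosuDeviceConfig("ethos-u55-256")
        # Working tensors may live in SRAM; constants are homed in FLASH.
        cs.cascade(sch, te_graph, const_dict, options, sram, flash, [sram], device_config)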