From 6dada8025bcb641a44acf2899c08b1ca0100c10c Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Tue, 5 Apr 2022 15:30:09 -0500 Subject: [PATCH] [Hexagon] Generalized HexagonBuffer::CopyTo/CopyFrom (#10878) * [Hexagon] Generalized HexagonBuffer::CopyTo/CopyFrom This change operates on the allocation regions in a `HexagonBuffer`, rather than referencing the managed allocation owned by a buffer, handling copies between two sets of possibly discontiguous regions. This will be necessary to handle discontiguous buffers that cannot be statically planned at compile-time, such as user-initiated allocations, within a shared memory pool. Contiguous regions of memory are recognized and result in a single DMA call. --- src/runtime/hexagon/hexagon/hexagon_buffer.cc | 126 ++++++---- src/runtime/hexagon/hexagon/hexagon_buffer.h | 36 +++ tests/cpp/runtime/hexagon_buffer.cc | 231 +++++++++++++++++- 3 files changed, 344 insertions(+), 49 deletions(-) diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc index 31820f5770ba..b1de44df330c 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc @@ -205,73 +205,105 @@ void HexagonBuffer::SetStorageScope(Optional scope) { } } -void HexagonBuffer::CopyTo(void* data, size_t nbytes) const { - CHECK_LE(nbytes, TotalBytes()); - CHECK(managed_allocations_.size() && "CopyTo not supported on unmanaged `external` allocations"); +std::vector BufferSet::MemoryCopies(const BufferSet& dest, const BufferSet& src, + size_t bytes_to_copy) { + CHECK_LE(bytes_to_copy, src.TotalBytes()); + CHECK_LE(bytes_to_copy, dest.TotalBytes()); + + auto pointer_to = [](const BufferSet& buf, size_t region_i, size_t byte_i) -> void* { + void* region = buf.buffers[region_i]; + return static_cast(region) + byte_i; + }; + + size_t num_src_regions = (bytes_to_copy + src.region_size_bytes - 1) / src.region_size_bytes; + + // First, determine all copies that do not 
cross boundaries in + // either source or destination region. This requires two loops, as + // a single source region may overlap one or more destination + // regions, and vice versa. + std::vector micro_copies; + for (size_t src_i = 0; src_i < num_src_regions; src_i++) { + size_t src_region_begin = src_i * src.region_size_bytes; + size_t src_region_end = std::min((src_i + 1) * src.region_size_bytes, bytes_to_copy); + + size_t dest_i_begin = src_region_begin / dest.region_size_bytes; + size_t dest_i_end = (src_region_end - 1) / dest.region_size_bytes + 1; + for (size_t dest_i = dest_i_begin; dest_i < dest_i_end; dest_i++) { + size_t offset_begin = std::max(src_region_begin, dest_i * dest.region_size_bytes); + size_t offset_end = std::min(src_region_end, (dest_i + 1) * dest.region_size_bytes); + + size_t num_bytes = offset_end - offset_begin; + void* src_ptr = pointer_to(src, src_i, offset_begin % src.region_size_bytes); + void* dest_ptr = pointer_to(dest, dest_i, offset_begin % dest.region_size_bytes); + micro_copies.push_back(MemoryCopy(dest_ptr, src_ptr, num_bytes)); + } + } + + return micro_copies; +} + +std::vector MemoryCopy::MergeAdjacent(std::vector micro_copies) { + std::sort(micro_copies.begin(), micro_copies.end(), + [](const MemoryCopy& a, const MemoryCopy& b) { return a.src < b.src; }); - size_t copied = 0; - for (const auto& managed_alloc : managed_allocations_) { - size_t bytes_to_copy = std::min(nbytes - copied, managed_alloc->allocation_nbytes_); - if (bytes_to_copy == 0) break; + std::vector macro_copies; + for (const auto& copy : micro_copies) { + if (macro_copies.size() && macro_copies.back().IsDirectlyBefore(copy)) { + macro_copies.back().num_bytes += copy.num_bytes; + } else { + macro_copies.push_back(copy); + } + } - void* data_plus_copied = static_cast((static_cast(data) + copied)); - int status = hexagon_user_dma_1d_sync(data_plus_copied, managed_alloc->data_, bytes_to_copy); - CHECK_EQ(status, 0); + return macro_copies; +} - copied += 
bytes_to_copy; +void hexagon_buffer_copy_across_regions(const BufferSet& dest, const BufferSet& src, + size_t bytes_to_copy) { + // First, determine all copies that do not cross boundaries in + // either source or destination region. + auto micro_copies = BufferSet::MemoryCopies(dest, src, bytes_to_copy); + + // If regions are contiguously allocated, we can reduce the number + // of copies required by merging adjacent copies. + auto macro_copies = MemoryCopy::MergeAdjacent(std::move(micro_copies)); + + // Finally, do the memory copies. + for (const auto& copy : macro_copies) { + int error_code = hexagon_user_dma_1d_sync(copy.dest, copy.src, copy.num_bytes); + CHECK_EQ(error_code, 0); } } +void HexagonBuffer::CopyTo(void* data, size_t nbytes) const { + CHECK(managed_allocations_.size() && "CopyTo not supported on unmanaged `external` allocations"); + + BufferSet src(allocations_.data(), allocations_.size(), nbytes_per_allocation_); + BufferSet dest(&data, 1, nbytes); + + hexagon_buffer_copy_across_regions(dest, src, nbytes); +} + void HexagonBuffer::CopyFrom(void* data, size_t nbytes) { - CHECK_LE(nbytes, TotalBytes()); CHECK(managed_allocations_.size() && "CopyFrom not supported on unmanaged `external` allocations"); - size_t copied = 0; - for (const auto& managed_alloc : managed_allocations_) { - size_t bytes_to_copy = std::min(nbytes - copied, managed_alloc->allocation_nbytes_); - if (bytes_to_copy == 0) break; - - void* data_plus_copied = static_cast((static_cast(data) + copied)); - int status = hexagon_user_dma_1d_sync(managed_alloc->data_, data_plus_copied, bytes_to_copy); - CHECK_EQ(status, 0); + BufferSet src(&data, 1, nbytes); + BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - copied += bytes_to_copy; - } + hexagon_buffer_copy_across_regions(dest, src, nbytes); } void HexagonBuffer::CopyFrom(const HexagonBuffer& other, size_t nbytes) { - CHECK_LE(nbytes, TotalBytes()); - CHECK_LE(nbytes, other.TotalBytes()); 
CHECK(managed_allocations_.size() && "CopyFrom not supported on unmanaged `external` allocations"); CHECK(other.managed_allocations_.size() && "CopyFrom not supported on unmanaged `external` allocations"); - if (managed_allocations_.size() == other.managed_allocations_.size()) { - size_t copied = 0; - for (size_t i = 0; i < managed_allocations_.size(); ++i) { - const auto& this_alloc = managed_allocations_[i]; - const auto& other_alloc = other.managed_allocations_[i]; + BufferSet src(other.allocations_.data(), other.allocations_.size(), other.nbytes_per_allocation_); + BufferSet dest(allocations_.data(), allocations_.size(), nbytes_per_allocation_); - size_t bytes_to_copy = std::min(nbytes - copied, this_alloc->allocation_nbytes_); - if (bytes_to_copy == 0) break; - - CHECK_LE(other_alloc->allocation_nbytes_, this_alloc->allocation_nbytes_); - - int status = hexagon_user_dma_1d_sync(this_alloc->data_, other_alloc->data_, bytes_to_copy); - CHECK_EQ(status, 0); - - copied += bytes_to_copy; - } - } else if (managed_allocations_.size() == 1) { - return other.CopyTo(managed_allocations_[0]->data_, nbytes); - } else if (other.managed_allocations_.size() == 1) { - return CopyFrom(other.managed_allocations_[0]->data_, nbytes); - } else { - CHECK(false) << "To copy between Hexagon Buffers they must either have the same number of " - "dimensions or one of the Hexagon Buffers must have a single dimension."; - } + hexagon_buffer_copy_across_regions(dest, src, nbytes); } } // namespace hexagon diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon/hexagon_buffer.h index 99167f69cfca..fa069d7dc14c 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.h +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.h @@ -171,6 +171,42 @@ class HexagonBuffer { StorageScope storage_scope_; }; +/*! 
/*! \brief Structure used to track/coalesce memory copies */
struct MemoryCopy {
  /*!
   * \brief Merge copies that are adjacent in both source and destination.
   * \param micro_copies The copies to merge.
   * \return The merged copies.
   */
  static std::vector<MemoryCopy> MergeAdjacent(std::vector<MemoryCopy> micro_copies);

  MemoryCopy(void* dest, void* src, size_t num_bytes)
      : dest(dest), src(src), num_bytes(num_bytes) {}

  /*!
   * \brief Whether `other` starts exactly where this copy ends.
   *
   * True only when this copy's source end equals `other.src` and its
   * destination end equals `other.dest`, i.e. the two copies can be
   * replaced by one copy of `num_bytes + other.num_bytes` bytes.
   *
   * Declared const because it only inspects the two copies, so it
   * can be called on const objects and through const references.
   */
  bool IsDirectlyBefore(const MemoryCopy& other) const {
    const void* src_end = static_cast<const unsigned char*>(src) + num_bytes;
    const void* dest_end = static_cast<const unsigned char*>(dest) + num_bytes;
    return (src_end == other.src) && (dest_end == other.dest);
  }

  /*! \brief First byte to be written. */
  void* dest;
  /*! \brief First byte to be read. */
  void* src;
  /*! \brief Number of bytes to copy. */
  size_t num_bytes;
};

/*!
 * \brief View of a set of equally-sized memory regions.
 *
 * Holds only a pointer to an array of region base addresses; it does
 * not copy or free that array, so the array must remain valid for as
 * long as the BufferSet is used.
 */
struct BufferSet {
  // Determine all copies that do not cross boundaries in either
  // source or destination region.
  static std::vector<MemoryCopy> MemoryCopies(const BufferSet& dest, const BufferSet& src,
                                              size_t bytes_to_copy);

  BufferSet(void* const* buffers, size_t num_regions, size_t region_size_bytes)
      : buffers(buffers), num_regions(num_regions), region_size_bytes(region_size_bytes) {}

  /*! \brief Combined size of all regions, in bytes. */
  size_t TotalBytes() const { return num_regions * region_size_bytes; }

  /*! \brief Array of `num_regions` region base addresses. */
  void* const* buffers;
  /*! \brief Number of entries in `buffers`. */
  size_t num_regions;
  /*! \brief Size of each region, in bytes. */
  size_t region_size_bytes;
};
EXPECT_EQ(micro_copies[i].src, ptr(16 * i)); + EXPECT_EQ(micro_copies[i].dest, ptr(64 + 16 * i)); + EXPECT_EQ(micro_copies[i].num_bytes, 16); + } +} + +TEST(HexagonBuffer, micro_copies_src_bigger) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(16)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 16); + + std::vector dest_ptr{ptr(64), ptr(72), ptr(80), ptr(88)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 8); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 32); + EXPECT_EQ(micro_copies.size(), 4); + for (size_t i = 0; i < micro_copies.size(); i++) { + EXPECT_EQ(micro_copies[i].src, ptr(8 * i)); + EXPECT_EQ(micro_copies[i].dest, ptr(64 + 8 * i)); + EXPECT_EQ(micro_copies[i].num_bytes, 8); + } +} + +TEST(HexagonBuffer, micro_copies_dest_bigger) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(8), ptr(16), ptr(24)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 8); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 16); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 32); + EXPECT_EQ(micro_copies.size(), 4); + for (size_t i = 0; i < micro_copies.size(); i++) { + EXPECT_EQ(micro_copies[i].src, ptr(8 * i)); + EXPECT_EQ(micro_copies[i].dest, ptr(64 + 8 * i)); + EXPECT_EQ(micro_copies[i].num_bytes, 8); + } +} + +TEST(HexagonBuffer, micro_copies_src_overlaps_dest_region) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(16)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 16); + + std::vector dest_ptr{ptr(64), ptr(76)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 12); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 24); + EXPECT_EQ(micro_copies.size(), 3); + + // First region of source, first region of dest + EXPECT_EQ(micro_copies[0].src, ptr(0)); + EXPECT_EQ(micro_copies[0].dest, ptr(64)); + EXPECT_EQ(micro_copies[0].num_bytes, 
12); + + // First region of source, second region of dest + EXPECT_EQ(micro_copies[1].src, ptr(12)); + EXPECT_EQ(micro_copies[1].dest, ptr(76)); + EXPECT_EQ(micro_copies[1].num_bytes, 4); + + // Second region of source, second region of dest + EXPECT_EQ(micro_copies[2].src, ptr(16)); + EXPECT_EQ(micro_copies[2].dest, ptr(80)); + EXPECT_EQ(micro_copies[2].num_bytes, 8); +} + +TEST(HexagonBuffer, micro_copies_dest_overlaps_src_region) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(12)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 12); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 16); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 24); + EXPECT_EQ(micro_copies.size(), 3); + + // First region of source, first region of dest + EXPECT_EQ(micro_copies[0].src, ptr(0)); + EXPECT_EQ(micro_copies[0].dest, ptr(64)); + EXPECT_EQ(micro_copies[0].num_bytes, 12); + + // Second region of source, first region of dest + EXPECT_EQ(micro_copies[1].src, ptr(12)); + EXPECT_EQ(micro_copies[1].dest, ptr(76)); + EXPECT_EQ(micro_copies[1].num_bytes, 4); + + // Second region of source, second region of dest + EXPECT_EQ(micro_copies[2].src, ptr(16)); + EXPECT_EQ(micro_copies[2].dest, ptr(80)); + EXPECT_EQ(micro_copies[2].num_bytes, 8); +} + +TEST(HexagonBuffer, micro_copies_discontiguous_regions) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + // Stride of 16, but only first 11 bytes in each region belong to + // this buffer. 
+ std::vector src_ptr{ptr(0), ptr(16)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 11); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 13); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 16); + EXPECT_EQ(micro_copies.size(), 3); + + // First region of source, first region of dest + EXPECT_EQ(micro_copies[0].src, ptr(0)); + EXPECT_EQ(micro_copies[0].dest, ptr(64)); + EXPECT_EQ(micro_copies[0].num_bytes, 11); + + // Second region of source, first region of dest + EXPECT_EQ(micro_copies[1].src, ptr(16)); + EXPECT_EQ(micro_copies[1].dest, ptr(75)); + EXPECT_EQ(micro_copies[1].num_bytes, 2); + + // Second region of source, second region of dest + EXPECT_EQ(micro_copies[2].src, ptr(18)); + EXPECT_EQ(micro_copies[2].dest, ptr(80)); + EXPECT_EQ(micro_copies[2].num_bytes, 3); +} + +TEST(HexagonBuffer, micro_copies_invalid_size) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(16)}; + std::vector dest_ptr{ptr(64), ptr(80)}; + + { + BufferSet src(src_ptr.data(), 1, 16); + BufferSet dest(dest_ptr.data(), 2, 16); + EXPECT_THROW(BufferSet::MemoryCopies(dest, src, 24), InternalError); + } + + { + BufferSet src(src_ptr.data(), 2, 16); + BufferSet dest(dest_ptr.data(), 1, 16); + EXPECT_THROW(BufferSet::MemoryCopies(dest, src, 24), InternalError); + } +} + +TEST(HexagonBuffer, macro_copies_adjacent_corresponding_regions_merged) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(16)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 16); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 16); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 32); + auto macro_copies = MemoryCopy::MergeAdjacent(std::move(micro_copies)); + + ASSERT_EQ(macro_copies.size(), 1); + EXPECT_EQ(macro_copies[0].src, ptr(0)); + EXPECT_EQ(macro_copies[0].dest, ptr(64)); + 
EXPECT_EQ(macro_copies[0].num_bytes, 32); +} + +TEST(HexagonBuffer, macro_copies_discontiguous_regions_not_merged) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(16)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 12); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 12); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 24); + auto macro_copies = MemoryCopy::MergeAdjacent(std::move(micro_copies)); + + ASSERT_EQ(macro_copies.size(), 2); + + EXPECT_EQ(macro_copies[0].src, ptr(0)); + EXPECT_EQ(macro_copies[0].dest, ptr(64)); + EXPECT_EQ(macro_copies[0].num_bytes, 12); + + EXPECT_EQ(macro_copies[1].src, ptr(16)); + EXPECT_EQ(macro_copies[1].dest, ptr(80)); + EXPECT_EQ(macro_copies[1].num_bytes, 12); +} + +TEST(HexagonBuffer, macro_copies_overlapping_regions_merged) { + auto ptr = [](auto val) { return reinterpret_cast(val); }; + + std::vector src_ptr{ptr(0), ptr(12)}; + BufferSet src(src_ptr.data(), src_ptr.size(), 12); + + std::vector dest_ptr{ptr(64), ptr(80)}; + BufferSet dest(dest_ptr.data(), dest_ptr.size(), 16); + + auto micro_copies = BufferSet::MemoryCopies(dest, src, 24); + auto macro_copies = MemoryCopy::MergeAdjacent(std::move(micro_copies)); + + ASSERT_EQ(macro_copies.size(), 1); + EXPECT_EQ(macro_copies[0].src, ptr(0)); + EXPECT_EQ(macro_copies[0].dest, ptr(64)); + EXPECT_EQ(macro_copies[0].num_bytes, 24); +} + TEST(HexagonBuffer, copy_from) { Optional scope("global"); HexagonBuffer hb(8 /* nbytes */, 8 /* alignment */, scope); @@ -202,8 +420,17 @@ TEST(HexagonBuffer, md_copy_from_nd) { Optional scope("global"); HexagonBuffer hb3d(3 /* ndim */, 4 /* nbytes */, 8 /* alignment */, scope); HexagonBuffer hb4d(4 /* ndim */, 3 /* nbytes */, 8 /* alignment */, scope); - EXPECT_THROW(hb3d.CopyFrom(hb4d, 12), InternalError); - EXPECT_THROW(hb4d.CopyFrom(hb3d, 12), InternalError); + + std::vector data{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + + 
hb3d.CopyFrom(data.data(), data.size()); + hb4d.CopyFrom(hb3d, data.size()); + + uint8_t** hb3d_ptr = static_cast(hb3d.GetPointer()); + uint8_t** hb4d_ptr = static_cast(hb4d.GetPointer()); + for (size_t i = 0; i < 12; i++) { + EXPECT_EQ(hb3d_ptr[i / 4][i % 4], hb4d_ptr[i / 3][i % 3]); + } } TEST(HexagonBuffer, copy_to) {