renderer_vulkan: Cleanup and improve barriers in caches #1865

Merged · 5 commits · Jan 2, 2025
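
The recurring change across these caches is a migration from synchronization1 pipelineBarrier calls with global vk::MemoryBarrier structs to synchronization2 pipelineBarrier2 calls with per-buffer vk::BufferMemoryBarrier2 structs, so each dependency is scoped to the exact buffer range a transfer touches. A minimal sketch of the post-transfer form used throughout the diff (illustrative only; the committed code below is authoritative):

    // After a transfer write, make the written range available to any
    // later stage that may read or write it.
    const vk::BufferMemoryBarrier2 post_barrier = {
        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
        .buffer = buffer.Handle(),
        .offset = 0,
        .size = buffer.SizeBytes(),
    };
    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
        .bufferMemoryBarrierCount = 1,
        .pBufferMemoryBarriers = &post_barrier,
    });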
147 changes: 117 additions & 30 deletions src/video_core/buffer_cache/buffer_cache.cpp
@@ -291,7 +291,16 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
const BufferId buffer_id = FindBuffer(address, num_bytes);
return &slot_buffers[buffer_id];
}();
const vk::BufferMemoryBarrier2 buf_barrier = {
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
Contributor: Reads in the srcAccessMask are a no-op according to the Vulkan spec (https://stackoverflow.com/a/64757475, KhronosGroup/Vulkan-Docs#131).

Collaborator (author): Hmm, correct. So we either need to guard for writes or make this eNone.
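
A minimal sketch of the eNone option mentioned above (illustrative, not the committed code): a pure write-after-read hazard needs only an execution dependency, so the source access mask can be left empty; if prior writes must also be made available, eMemoryWrite would be used instead.

    const vk::BufferMemoryBarrier2 pre_barrier = {
        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
        // Reads never need an availability operation, so no source access
        // mask is required for the WAR case.
        .srcAccessMask = vk::AccessFlagBits2::eNone,
        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
        .buffer = buffer->Handle(),
        .offset = buffer->Offset(address),
        .size = num_bytes,
    };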

.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer->Handle(),
.offset = buffer->Offset(address),
.size = num_bytes,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
@@ -303,9 +312,14 @@
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &buf_barrier,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.updateBuffer(buffer->Handle(), buffer->Offset(address), num_bytes, value);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value);
}

std::pair<Buffer*, u32> BufferCache::ObtainHostUBO(std::span<const u32> data) {
@@ -496,21 +510,48 @@ void BufferCache::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
};
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
static constexpr vk::MemoryBarrier READ_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
const std::array pre_barriers = {
vk::BufferMemoryBarrier2{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferRead,
.buffer = overlap.Handle(),
.offset = 0,
.size = overlap.SizeBytes(),
},
};
static constexpr vk::MemoryBarrier WRITE_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
const std::array post_barriers = {
vk::BufferMemoryBarrier2{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferRead,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.buffer = overlap.Handle(),
.offset = 0,
.size = overlap.SizeBytes(),
},
vk::BufferMemoryBarrier2{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = new_buffer.Handle(),
.offset = dst_base_offset,
.size = overlap.SizeBytes(),
},
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
READ_BARRIER, {}, {});
cmdbuf.copyBuffer(overlap.buffer, new_buffer.buffer, copy);
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = pre_barriers.data(),
});
cmdbuf.copyBuffer(overlap.Handle(), new_buffer.Handle(), copy);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = static_cast<u32>(post_barriers.size()),
.pBufferMemoryBarriers = post_barriers.data(),
});
DeleteBuffer(overlap_id);
}

@@ -614,21 +655,35 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
}
scheduler.EndRendering();
const auto cmdbuf = scheduler.CommandBuffer();
static constexpr vk::MemoryBarrier READ_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = 0,
.size = buffer.SizeBytes(),
};
static constexpr vk::MemoryBarrier WRITE_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = buffer.Handle(),
.offset = 0,
.size = buffer.SizeBytes(),
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
READ_BARRIER, {}, {});
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
});
cmdbuf.copyBuffer(src_buffer, buffer.buffer, copies);
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}

bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
@@ -678,10 +733,42 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
}
if (!copies.empty()) {
scheduler.EndRendering();
image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits2::eTransferRead, {});
const vk::BufferMemoryBarrier2 pre_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
.buffer = buffer.Handle(),
.offset = max_offset - size,
.size = size,
};
const vk::BufferMemoryBarrier2 post_barrier = {
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
.buffer = buffer.Handle(),
.offset = max_offset - size,
.size = size,
};
auto barriers = image.GetBarriers(vk::ImageLayout::eTransferSrcOptimal,
vk::AccessFlagBits2::eTransferRead,
vk::PipelineStageFlagBits2::eTransfer, {});
const auto cmdbuf = scheduler.CommandBuffer();
cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
.imageMemoryBarrierCount = static_cast<u32>(barriers.size()),
.pImageMemoryBarriers = barriers.data(),
});
cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.Handle(),
copies);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
}
return true;
}
8 changes: 7 additions & 1 deletion src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -568,6 +568,12 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
push_data.AddOffset(binding.buffer, adjust);
buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned,
vsharp.GetSize() + adjust);
if (auto barrier =
vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead,
vk::PipelineStageFlagBits2::eAllCommands)) {
buffer_barriers.emplace_back(*barrier);
}
}

set_writes.push_back({
Expand Down Expand Up @@ -606,7 +612,7 @@ void Rasterizer::BindBuffers(const Shader::Info& stage, Shader::Backend::Binding
if (auto barrier =
vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
: vk::AccessFlagBits2::eShaderRead,
vk::PipelineStageFlagBits2::eComputeShader)) {
vk::PipelineStageFlagBits2::eAllCommands)) {
buffer_barriers.emplace_back(*barrier);
}
if (desc.is_written) {
45 changes: 38 additions & 7 deletions src/video_core/texture_cache/texture_cache.cpp
@@ -542,31 +542,62 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
sched_ptr->EndRendering();

const auto cmdbuf = sched_ptr->CommandBuffer();
image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite, {},
cmdbuf);

const VAddr image_addr = image.info.guest_address;
const size_t image_size = image.info.guest_size_bytes;
const auto [vk_buffer, buf_offset] =
buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);

// The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW
// hazard
if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
vk::PipelineStageFlagBits2::eTransfer)) {
const auto dependencies = vk::DependencyInfo{
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &barrier.value(),
};
cmdbuf.pipelineBarrier2(dependencies);
});
}

const auto [buffer, offset] = tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image);
const auto [buffer, offset] =
tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image.info);
for (auto& copy : image_copy) {
copy.bufferOffset += offset;
}

const vk::BufferMemoryBarrier2 pre_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
.dstAccessMask = vk::AccessFlagBits2::eTransferRead,
.buffer = buffer,
.offset = offset,
.size = image_size,
};
const vk::BufferMemoryBarrier2 post_barrier{
.srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
.srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
.dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
.dstAccessMask = vk::AccessFlagBits2::eMemoryRead | vk::AccessFlagBits2::eMemoryWrite,
.buffer = buffer,
.offset = offset,
.size = image_size,
};
const auto image_barriers =
image.GetBarriers(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits2::eTransferWrite,
vk::PipelineStageFlagBits2::eTransfer, {});
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &pre_barrier,
.imageMemoryBarrierCount = static_cast<u32>(image_barriers.size()),
.pImageMemoryBarriers = image_barriers.data(),
});
cmdbuf.copyBufferToImage(buffer, image.image, vk::ImageLayout::eTransferDstOptimal, image_copy);
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
.bufferMemoryBarrierCount = 1,
.pBufferMemoryBarriers = &post_barrier,
});
image.flags &= ~ImageFlagBits::Dirty;
}

59 changes: 24 additions & 35 deletions src/video_core/texture_cache/tile_manager.cpp
@@ -4,6 +4,7 @@
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
#include "video_core/texture_cache/image_info.h"
#include "video_core/texture_cache/image_view.h"
#include "video_core/texture_cache/tile_manager.h"

@@ -82,10 +83,10 @@ static vk::Format DemoteImageFormatForDetiling(vk::Format format) {
return format;
}

const DetilerContext* TileManager::GetDetiler(const Image& image) const {
const auto format = DemoteImageFormatForDetiling(image.info.pixel_format);
const DetilerContext* TileManager::GetDetiler(const ImageInfo& info) const {
const auto format = DemoteImageFormatForDetiling(info.pixel_format);

switch (image.info.tiling_mode) {
switch (info.tiling_mode) {
case AmdGpu::TilingMode::Texture_MicroTiled:
switch (format) {
case vk::Format::eR8Uint:
@@ -254,23 +255,23 @@ void TileManager::FreeBuffer(ScratchBuffer buffer) {
}

std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_offset,
Image& image) {
if (!image.info.props.is_tiled) {
const ImageInfo& info) {
if (!info.props.is_tiled) {
return {in_buffer, in_offset};
}

const auto* detiler = GetDetiler(image);
const auto* detiler = GetDetiler(info);
if (!detiler) {
if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled &&
image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled &&
image.info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) {
if (info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled &&
info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled &&
info.tiling_mode != AmdGpu::TilingMode::Depth_MacroTiled) {
LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
vk::to_string(info.pixel_format), NameOf(info.tiling_mode));
}
return {in_buffer, in_offset};
}

const u32 image_size = image.info.guest_size_bytes;
const u32 image_size = info.guest_size_bytes;

// Prepare output buffer
auto out_buffer = AllocBuffer(image_size, true);
@@ -313,22 +314,21 @@ std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o
set_writes);

DetilerParams params;
params.num_levels = image.info.resources.levels;
params.pitch0 = image.info.pitch >> (image.info.props.is_block ? 2u : 0u);
params.height = image.info.size.height;
if (image.info.tiling_mode == AmdGpu::TilingMode::Texture_Volume) {
ASSERT(image.info.resources.levels == 1);
ASSERT(image.info.num_bits >= 32);
const auto tiles_per_row = image.info.pitch / 8u;
const auto tiles_per_slice = tiles_per_row * ((image.info.size.height + 7u) / 8u);
params.num_levels = info.resources.levels;
params.pitch0 = info.pitch >> (info.props.is_block ? 2u : 0u);
params.height = info.size.height;
if (info.tiling_mode == AmdGpu::TilingMode::Texture_Volume) {
ASSERT(info.resources.levels == 1);
ASSERT(info.num_bits >= 32);
const auto tiles_per_row = info.pitch / 8u;
const auto tiles_per_slice = tiles_per_row * ((info.size.height + 7u) / 8u);
params.sizes[0] = tiles_per_row;
params.sizes[1] = tiles_per_slice;
} else {

ASSERT(image.info.resources.levels <= 14);
ASSERT(info.resources.levels <= 14);
std::memset(&params.sizes, 0, sizeof(params.sizes));
for (int m = 0; m < image.info.resources.levels; ++m) {
params.sizes[m] = image.info.mips_layout[m].size * image.info.resources.layers +
for (int m = 0; m < info.resources.levels; ++m) {
params.sizes[m] = info.mips_layout[m].size * info.resources.layers +
(m > 0 ? params.sizes[m - 1] : 0);
}
}
@@ -337,20 +337,9 @@ std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_o
&params);

ASSERT((image_size % 64) == 0);
const auto bpp = image.info.num_bits * (image.info.props.is_block ? 16u : 1u);
const auto bpp = info.num_bits * (info.props.is_block ? 16u : 1u);
const auto num_tiles = image_size / (64 * (bpp / 8));
cmdbuf.dispatch(num_tiles, 1, 1);

const vk::BufferMemoryBarrier post_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead,
.buffer = out_buffer.first,
.size = image_size,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eTransfer, vk::DependencyFlagBits::eByRegion,
{}, post_barrier, {});

return {out_buffer.first, 0};
}
