From 3fb6cefb2f3e89e6bfaf0ef55bac57bd4de6e230 Mon Sep 17 00:00:00 2001 From: JMS55 <47158642+JMS55@users.noreply.github.com> Date: Wed, 23 Oct 2024 12:18:49 -0700 Subject: [PATCH] Meshlet fill cluster buffers rewritten (#15955) # Objective - Make the meshlet fill cluster buffers pass slightly faster - Address https://github.com/bevyengine/bevy/issues/15920 for meshlets - Added PreviousGlobalTransform as a required meshlet component to avoid extra archetype moves, slightly alleviating https://github.com/bevyengine/bevy/issues/14681 for meshlets - Enforce that MeshletPlugin::cluster_buffer_slots is not greater than 2^25 (glitches will occur otherwise). Technically this field controls post-lod/culling cluster count, and the issue is on pre-lod/culling cluster count, but it's still valid now, and in the future this will be more true. Needs to be merged after https://github.com/bevyengine/bevy/pull/15846 and https://github.com/bevyengine/bevy/pull/15886 ## Solution - Old pass dispatched a thread per cluster, and did a binary search over the instances to find which instance the cluster belongs to, and what meshlet index within the instance it is. - New pass dispatches a workgroup per instance, and has the workgroup loop over all meshlets in the instance in order to write out the cluster data. - Use a push constant instead of arrayLength to fix the linked bug - Remap 1d->2d dispatch for software raster only if actually needed to save on spawning excess workgroups ## Testing - Did you test these changes? If so, how? - Ran the meshlet example, and an example with 1041 instances of 32217 meshlets per instance. Profiled the second scene with nsight, went from 0.55ms -> 0.40ms. Small savings. We're pretty much VRAM bandwidth bound at this point. - How can other people (reviewers) test your changes? Is there anything specific they need to know? - Run the meshlet example ## Changelog (non-meshlets) - PreviousGlobalTransform now implements the Default trait --- .../bevy_pbr/src/meshlet/cull_clusters.wgsl | 6 +- .../src/meshlet/fill_cluster_buffers.wgsl | 56 ++++++++++--------- .../bevy_pbr/src/meshlet/instance_manager.rs | 48 +++++++++------- .../src/meshlet/meshlet_bindings.wgsl | 16 +++--- crates/bevy_pbr/src/meshlet/mod.rs | 11 +++- crates/bevy_pbr/src/meshlet/pipelines.rs | 9 ++- .../src/meshlet/remap_1d_to_2d_dispatch.wgsl | 9 ++- .../bevy_pbr/src/meshlet/resource_manager.rs | 22 +++++--- .../meshlet/visibility_buffer_raster_node.rs | 42 +++++++++++--- .../meshlet/visibility_buffer_resolve.wgsl | 2 +- crates/bevy_pbr/src/prepass/mod.rs | 2 +- 11 files changed, 140 insertions(+), 83 deletions(-) diff --git a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl index 2b036dee02c56..47f6dbb04b6be 100644 --- a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl +++ b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl @@ -13,7 +13,7 @@ meshlet_software_raster_indirect_args, meshlet_hardware_raster_indirect_args, meshlet_raster_clusters, - meshlet_raster_cluster_rightmost_slot, + constants, MeshletBoundingSphere, } #import bevy_render::maths::affine3_to_square @@ -32,7 +32,7 @@ fn cull_clusters( ) { // Calculate the cluster ID for this thread let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); - if cluster_id >= arrayLength(&meshlet_cluster_meshlet_ids) { return; } + if cluster_id >= constants.scene_cluster_count { return; } #ifdef MESHLET_SECOND_CULLING_PASS if !cluster_is_second_pass_candidate(cluster_id) { return; } @@ -138,7 +138,7 @@ fn cull_clusters( } else { // Append this cluster to the list for hardware rasterization buffer_slot = atomicAdd(&meshlet_hardware_raster_indirect_args.instance_count, 1u); - buffer_slot = meshlet_raster_cluster_rightmost_slot - buffer_slot; + buffer_slot = constants.meshlet_raster_cluster_rightmost_slot - buffer_slot; } meshlet_raster_clusters[buffer_slot] = cluster_id; } diff --git a/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl b/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl index 04af6c4ad7091..db39ae2bcedb9 100644 --- a/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl +++ b/crates/bevy_pbr/src/meshlet/fill_cluster_buffers.wgsl @@ -1,6 +1,7 @@ #import bevy_pbr::meshlet_bindings::{ - cluster_count, - meshlet_instance_meshlet_counts_prefix_sum, + scene_instance_count, + meshlet_global_cluster_count, + meshlet_instance_meshlet_counts, meshlet_instance_meshlet_slice_starts, meshlet_cluster_instance_ids, meshlet_cluster_meshlet_ids, @@ -8,37 +9,42 @@ /// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene. +var cluster_slice_start_workgroup: u32; + @compute -@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread +@workgroup_size(1024, 1, 1) // 1024 threads per workgroup, 1 instance per workgroup fn fill_cluster_buffers( @builtin(workgroup_id) workgroup_id: vec3, @builtin(num_workgroups) num_workgroups: vec3, @builtin(local_invocation_index) local_invocation_index: u32, ) { - // Calculate the cluster ID for this thread - let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); - if cluster_id >= cluster_count { return; } // TODO: Could be an arrayLength? - - // Binary search to find the instance this cluster belongs to - var left = 0u; - var right = arrayLength(&meshlet_instance_meshlet_counts_prefix_sum) - 1u; - while left <= right { - let mid = (left + right) / 2u; - if meshlet_instance_meshlet_counts_prefix_sum[mid] <= cluster_id { - left = mid + 1u; - } else { - right = mid - 1u; - } + // Calculate the instance ID for this workgroup + var instance_id = workgroup_id.x + (workgroup_id.y * num_workgroups.x); + if instance_id >= scene_instance_count { return; } + + let instance_meshlet_count = meshlet_instance_meshlet_counts[instance_id]; + let instance_meshlet_slice_start = meshlet_instance_meshlet_slice_starts[instance_id]; + + // Reserve cluster slots for the instance and broadcast to the workgroup + if local_invocation_index == 0u { + cluster_slice_start_workgroup = atomicAdd(&meshlet_global_cluster_count, instance_meshlet_count); } - let instance_id = right; + let cluster_slice_start = workgroupUniformLoad(&cluster_slice_start_workgroup); - // Find the meshlet ID for this cluster within the instance's MeshletMesh - let meshlet_id_local = cluster_id - meshlet_instance_meshlet_counts_prefix_sum[instance_id]; + // Loop enough times to write out all the meshlets for the instance given that each thread writes 1 meshlet in each iteration + for (var clusters_written = 0u; clusters_written < instance_meshlet_count; clusters_written += 1024u) { + // Calculate meshlet ID within this instance's MeshletMesh to process for this thread + let meshlet_id_local = clusters_written + local_invocation_index; + if meshlet_id_local >= instance_meshlet_count { return; } - // Find the overall meshlet ID in the global meshlet buffer - let meshlet_id = meshlet_id_local + meshlet_instance_meshlet_slice_starts[instance_id]; + // Find the overall cluster ID in the global cluster buffer + let cluster_id = cluster_slice_start + meshlet_id_local; - // Write results to buffers - meshlet_cluster_instance_ids[cluster_id] = instance_id; - meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id; + // Find the overall meshlet ID in the global meshlet buffer + let meshlet_id = instance_meshlet_slice_start + meshlet_id_local; + + // Write results to buffers + meshlet_cluster_instance_ids[cluster_id] = instance_id; + meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id; + } } diff --git a/crates/bevy_pbr/src/meshlet/instance_manager.rs b/crates/bevy_pbr/src/meshlet/instance_manager.rs index eec2f32d35790..549c74d8bc9d3 100644 --- a/crates/bevy_pbr/src/meshlet/instance_manager.rs +++ b/crates/bevy_pbr/src/meshlet/instance_manager.rs @@ -10,8 +10,9 @@ use bevy_ecs::{ query::Has, system::{Local, Query, Res, ResMut, Resource, SystemState}, }; -use bevy_render::sync_world::MainEntity; -use bevy_render::{render_resource::StorageBuffer, view::RenderLayers, MainWorld}; +use bevy_render::{ + render_resource::StorageBuffer, sync_world::MainEntity, view::RenderLayers, MainWorld, +}; use bevy_transform::components::GlobalTransform; use bevy_utils::{HashMap, HashSet}; use core::ops::{DerefMut, Range}; @@ -19,33 +20,36 @@ use core::ops::{DerefMut, Range}; /// Manages data for each entity with a [`MeshletMesh`]. #[derive(Resource)] pub struct InstanceManager { - /// Amount of clusters in the scene (sum of all meshlet counts across all instances) + /// Amount of instances in the scene. + pub scene_instance_count: u32, + /// Amount of clusters in the scene. pub scene_cluster_count: u32, - /// Per-instance [`MainEntity`], [`RenderLayers`], and [`NotShadowCaster`] + /// Per-instance [`MainEntity`], [`RenderLayers`], and [`NotShadowCaster`]. pub instances: Vec<(MainEntity, RenderLayers, bool)>, - /// Per-instance [`MeshUniform`] + /// Per-instance [`MeshUniform`]. pub instance_uniforms: StorageBuffer>, - /// Per-instance material ID + /// Per-instance material ID. pub instance_material_ids: StorageBuffer>, - /// Prefix-sum of meshlet counts per instance - pub instance_meshlet_counts_prefix_sum: StorageBuffer>, - /// Per-instance index to the start of the instance's slice of the meshlets buffer + /// Per-instance count of meshlets in the instance's [`MeshletMesh`]. + pub instance_meshlet_counts: StorageBuffer>, + /// Per-instance index to the start of the instance's slice of the meshlets buffer. pub instance_meshlet_slice_starts: StorageBuffer>, /// Per-view per-instance visibility bit. Used for [`RenderLayers`] and [`NotShadowCaster`] support. pub view_instance_visibility: EntityHashMap>>, - /// Next material ID available for a [`Material`] + /// Next material ID available for a [`Material`]. next_material_id: u32, - /// Map of [`Material`] to material ID + /// Map of [`Material`] to material ID. material_id_lookup: HashMap, - /// Set of material IDs used in the scene + /// Set of material IDs used in the scene. material_ids_present_in_scene: HashSet, } impl InstanceManager { pub fn new() -> Self { Self { + scene_instance_count: 0, scene_cluster_count: 0, instances: Vec::new(), @@ -59,9 +63,9 @@ impl InstanceManager { buffer.set_label(Some("meshlet_instance_material_ids")); buffer }, - instance_meshlet_counts_prefix_sum: { + instance_meshlet_counts: { let mut buffer = StorageBuffer::default(); - buffer.set_label(Some("meshlet_instance_meshlet_counts_prefix_sum")); + buffer.set_label(Some("meshlet_instance_meshlet_counts")); buffer }, instance_meshlet_slice_starts: { @@ -80,7 +84,7 @@ impl InstanceManager { #[allow(clippy::too_many_arguments)] pub fn add_instance( &mut self, - instance: Entity, + instance: MainEntity, meshlets_slice: Range, transform: &GlobalTransform, previous_transform: Option<&PreviousGlobalTransform>, @@ -108,20 +112,21 @@ impl InstanceManager { // Append instance data self.instances.push(( - instance.into(), + instance, render_layers.cloned().unwrap_or(RenderLayers::default()), not_shadow_caster, )); self.instance_uniforms.get_mut().push(mesh_uniform); self.instance_material_ids.get_mut().push(0); - self.instance_meshlet_counts_prefix_sum + self.instance_meshlet_counts .get_mut() - .push(self.scene_cluster_count); + .push(meshlets_slice.len() as u32); self.instance_meshlet_slice_starts .get_mut() .push(meshlets_slice.start); - self.scene_cluster_count += meshlets_slice.end - meshlets_slice.start; + self.scene_instance_count += 1; + self.scene_cluster_count += meshlets_slice.len() as u32; } /// Get the material ID for a [`crate::Material`]. @@ -140,12 +145,13 @@ impl InstanceManager { } pub fn reset(&mut self, entities: &Entities) { + self.scene_instance_count = 0; self.scene_cluster_count = 0; self.instances.clear(); self.instance_uniforms.get_mut().clear(); self.instance_material_ids.get_mut().clear(); - self.instance_meshlet_counts_prefix_sum.get_mut().clear(); + self.instance_meshlet_counts.get_mut().clear(); self.instance_meshlet_slice_starts.get_mut().clear(); self.view_instance_visibility .retain(|view_entity, _| entities.contains(*view_entity)); @@ -227,7 +233,7 @@ pub fn extract_meshlet_mesh_entities( // Add the instance's data to the instance manager instance_manager.add_instance( - instance, + instance.into(), meshlets_slice, transform, previous_transform, diff --git a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl index 42b322d731a80..7af63d0e0fe83 100644 --- a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl +++ b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl @@ -51,15 +51,17 @@ struct DrawIndirectArgs { const CENTIMETERS_PER_METER = 100.0; #ifdef MESHLET_FILL_CLUSTER_BUFFERS_PASS -var cluster_count: u32; -@group(0) @binding(0) var meshlet_instance_meshlet_counts_prefix_sum: array; // Per entity instance +var scene_instance_count: u32; +@group(0) @binding(0) var meshlet_instance_meshlet_counts: array; // Per entity instance @group(0) @binding(1) var meshlet_instance_meshlet_slice_starts: array; // Per entity instance @group(0) @binding(2) var meshlet_cluster_instance_ids: array; // Per cluster @group(0) @binding(3) var meshlet_cluster_meshlet_ids: array; // Per cluster +@group(0) @binding(4) var meshlet_global_cluster_count: atomic; // Single object shared between all workgroups #endif #ifdef MESHLET_CULLING_PASS -var meshlet_raster_cluster_rightmost_slot: u32; +struct Constants { scene_cluster_count: u32, meshlet_raster_cluster_rightmost_slot: u32 } +var constants: Constants; @group(0) @binding(0) var meshlet_cluster_meshlet_ids: array; // Per cluster @group(0) @binding(1) var meshlet_bounding_spheres: array; // Per meshlet @group(0) @binding(2) var meshlet_simplification_errors: array; // Per meshlet @@ -67,9 +69,9 @@ var meshlet_raster_cluster_rightmost_slot: u32; @group(0) @binding(4) var meshlet_instance_uniforms: array; // Per entity instance @group(0) @binding(5) var meshlet_view_instance_visibility: array; // 1 bit per entity instance, packed as a bitmask @group(0) @binding(6) var meshlet_second_pass_candidates: array>; // 1 bit per cluster , packed as a bitmask -@group(0) @binding(7) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; // Single object shared between all workgroups/clusters/triangles -@group(0) @binding(8) var meshlet_hardware_raster_indirect_args: DrawIndirectArgs; // Single object shared between all workgroups/clusters/triangles -@group(0) @binding(9) var meshlet_raster_clusters: array; // Single object shared between all workgroups/clusters/triangles +@group(0) @binding(7) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; // Single object shared between all workgroups +@group(0) @binding(8) var meshlet_hardware_raster_indirect_args: DrawIndirectArgs; // Single object shared between all workgroups +@group(0) @binding(9) var meshlet_raster_clusters: array; // Single object shared between all workgroups @group(0) @binding(10) var depth_pyramid: texture_2d; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass @group(0) @binding(11) var view: View; @group(0) @binding(12) var previous_view: PreviousViewUniforms; @@ -95,7 +97,7 @@ fn cluster_is_second_pass_candidate(cluster_id: u32) -> bool { @group(0) @binding(3) var meshlet_vertex_positions: array; // Many per meshlet @group(0) @binding(4) var meshlet_cluster_instance_ids: array; // Per cluster @group(0) @binding(5) var meshlet_instance_uniforms: array; // Per entity instance -@group(0) @binding(6) var meshlet_raster_clusters: array; // Single object shared between all workgroups/clusters/triangles +@group(0) @binding(6) var meshlet_raster_clusters: array; // Single object shared between all workgroups @group(0) @binding(7) var meshlet_software_raster_cluster_count: u32; #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(8) var meshlet_visibility_buffer: array>; // Per pixel diff --git a/crates/bevy_pbr/src/meshlet/mod.rs b/crates/bevy_pbr/src/meshlet/mod.rs index 1deb4ae85c762..f8a5d46747fe2 100644 --- a/crates/bevy_pbr/src/meshlet/mod.rs +++ b/crates/bevy_pbr/src/meshlet/mod.rs @@ -56,7 +56,7 @@ use self::{ }, visibility_buffer_raster_node::MeshletVisibilityBufferRasterPassNode, }; -use crate::{graph::NodePbr, Material, MeshMaterial3d}; +use crate::{graph::NodePbr, Material, MeshMaterial3d, PreviousGlobalTransform}; use bevy_app::{App, Plugin, PostUpdate}; use bevy_asset::{load_internal_asset, AssetApp, AssetId, Handle}; use bevy_core_pipeline::{ @@ -129,6 +129,8 @@ pub struct MeshletPlugin { /// If this number is too low, you'll see rendering artifacts like missing or blinking meshes. /// /// Each cluster slot costs 4 bytes of VRAM. + /// + /// Must not be greater than 2^25. pub cluster_buffer_slots: u32, } @@ -147,6 +149,11 @@ impl Plugin for MeshletPlugin { #[cfg(target_endian = "big")] compile_error!("MeshletPlugin is only supported on little-endian processors."); + if self.cluster_buffer_slots > 2_u32.pow(25) { + error!("MeshletPlugin::cluster_buffer_slots must not be greater than 2^25."); + std::process::exit(1); + } + load_internal_asset!( app, MESHLET_BINDINGS_SHADER_HANDLE, @@ -293,7 +300,7 @@ impl Plugin for MeshletPlugin { /// The meshlet mesh equivalent of [`bevy_render::mesh::Mesh3d`]. #[derive(Component, Clone, Debug, Default, Deref, DerefMut, Reflect, PartialEq, Eq, From)] #[reflect(Component, Default)] -#[require(Transform, Visibility)] +#[require(Transform, PreviousGlobalTransform, Visibility)] pub struct MeshletMesh3d(pub Handle); impl From for AssetId { diff --git a/crates/bevy_pbr/src/meshlet/pipelines.rs b/crates/bevy_pbr/src/meshlet/pipelines.rs index e02ad5f5aced3..0ee25bde6a0d0 100644 --- a/crates/bevy_pbr/src/meshlet/pipelines.rs +++ b/crates/bevy_pbr/src/meshlet/pipelines.rs @@ -84,7 +84,7 @@ impl FromWorld for MeshletPipelines { layout: vec![cull_layout.clone()], push_constant_ranges: vec![PushConstantRange { stages: ShaderStages::COMPUTE, - range: 0..4, + range: 0..8, }], shader: MESHLET_CULLING_SHADER_HANDLE, shader_defs: vec![ @@ -99,7 +99,7 @@ impl FromWorld for MeshletPipelines { layout: vec![cull_layout], push_constant_ranges: vec![PushConstantRange { stages: ShaderStages::COMPUTE, - range: 0..4, + range: 0..8, }], shader: MESHLET_CULLING_SHADER_HANDLE, shader_defs: vec![ @@ -441,7 +441,10 @@ impl FromWorld for MeshletPipelines { pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { label: Some("meshlet_remap_1d_to_2d_dispatch_pipeline".into()), layout: vec![layout], - push_constant_ranges: vec![], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }], shader: MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE, shader_defs: vec![], entry_point: "remap_dispatch".into(), diff --git a/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl b/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl index 6ade11b1d87e6..fc984436347bd 100644 --- a/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl +++ b/crates/bevy_pbr/src/meshlet/remap_1d_to_2d_dispatch.wgsl @@ -8,13 +8,16 @@ struct DispatchIndirectArgs { @group(0) @binding(0) var meshlet_software_raster_indirect_args: DispatchIndirectArgs; @group(0) @binding(1) var meshlet_software_raster_cluster_count: u32; +var max_compute_workgroups_per_dimension: u32; @compute @workgroup_size(1, 1, 1) fn remap_dispatch() { meshlet_software_raster_cluster_count = meshlet_software_raster_indirect_args.x; - let n = u32(ceil(sqrt(f32(meshlet_software_raster_indirect_args.x)))); - meshlet_software_raster_indirect_args.x = n; - meshlet_software_raster_indirect_args.y = n; + if meshlet_software_raster_cluster_count > max_compute_workgroups_per_dimension { + let n = u32(ceil(sqrt(f32(meshlet_software_raster_cluster_count)))); + meshlet_software_raster_indirect_args.x = n; + meshlet_software_raster_indirect_args.y = n; + } } diff --git a/crates/bevy_pbr/src/meshlet/resource_manager.rs b/crates/bevy_pbr/src/meshlet/resource_manager.rs index edfc7ba5f3da6..79473b2c36fe1 100644 --- a/crates/bevy_pbr/src/meshlet/resource_manager.rs +++ b/crates/bevy_pbr/src/meshlet/resource_manager.rs @@ -122,6 +122,7 @@ impl ResourceManager { storage_buffer_read_only_sized(false, None), storage_buffer_sized(false, None), storage_buffer_sized(false, None), + storage_buffer_sized(false, None), ), ), ), @@ -246,6 +247,7 @@ impl ResourceManager { #[derive(Component)] pub struct MeshletViewResources { + pub scene_instance_count: u32, pub scene_cluster_count: u32, pub second_pass_candidates_buffer: Buffer, instance_visibility: Buffer, @@ -330,7 +332,7 @@ pub fn prepare_meshlet_per_frame_resources( &render_queue, ); upload_storage_buffer( - &mut instance_manager.instance_meshlet_counts_prefix_sum, + &mut instance_manager.instance_meshlet_counts, &render_device, &render_queue, ); @@ -340,9 +342,6 @@ pub fn prepare_meshlet_per_frame_resources( &render_queue, ); - // Early submission for GPU data uploads to start while the render graph records commands - render_queue.submit([]); - let needed_buffer_size = 4 * instance_manager.scene_cluster_count as u64; match &mut resource_manager.cluster_instance_ids { Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(), @@ -553,6 +552,7 @@ pub fn prepare_meshlet_per_frame_resources( }; commands.entity(view_entity).insert(MeshletViewResources { + scene_instance_count: instance_manager.scene_instance_count, scene_cluster_count: instance_manager.scene_cluster_count, second_pass_candidates_buffer, instance_visibility, @@ -602,19 +602,25 @@ pub fn prepare_meshlet_view_bind_groups( let first_node = Arc::new(AtomicBool::new(true)); + let fill_cluster_buffers_global_cluster_count = + render_device.create_buffer(&BufferDescriptor { + label: Some("meshlet_fill_cluster_buffers_global_cluster_count"), + size: 4, + usage: BufferUsages::STORAGE, + mapped_at_creation: false, + }); + // TODO: Some of these bind groups can be reused across multiple views for (view_entity, view_resources) in &views { let entries = BindGroupEntries::sequential(( - instance_manager - .instance_meshlet_counts_prefix_sum - .binding() - .unwrap(), + instance_manager.instance_meshlet_counts.binding().unwrap(), instance_manager .instance_meshlet_slice_starts .binding() .unwrap(), cluster_instance_ids.as_entire_binding(), cluster_meshlet_ids.as_entire_binding(), + fill_cluster_buffers_global_cluster_count.as_entire_binding(), )); let fill_cluster_buffers = render_device.create_bind_group( "meshlet_fill_cluster_buffers", diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs index 3dceb239ccda3..f0de6d7769a39 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs @@ -118,8 +118,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode { render_context, &meshlet_view_bind_groups.fill_cluster_buffers, fill_cluster_buffers_pipeline, - thread_per_cluster_workgroups, - meshlet_view_resources.scene_cluster_count, + meshlet_view_resources.scene_instance_count, ); } cull_pass( @@ -130,6 +129,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode { previous_view_offset, culling_first_pipeline, thread_per_cluster_workgroups, + meshlet_view_resources.scene_cluster_count, meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_bind_groups .remap_1d_to_2d_dispatch @@ -165,6 +165,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode { previous_view_offset, culling_second_pipeline, thread_per_cluster_workgroups, + meshlet_view_resources.scene_cluster_count, meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_bind_groups .remap_1d_to_2d_dispatch @@ -253,6 +254,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode { previous_view_offset, culling_first_pipeline, thread_per_cluster_workgroups, + meshlet_view_resources.scene_cluster_count, meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_bind_groups .remap_1d_to_2d_dispatch @@ -288,6 +290,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode { previous_view_offset, culling_second_pipeline, thread_per_cluster_workgroups, + meshlet_view_resources.scene_cluster_count, meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_bind_groups .remap_1d_to_2d_dispatch @@ -334,21 +337,32 @@ fn fill_cluster_buffers_pass( render_context: &mut RenderContext, fill_cluster_buffers_bind_group: &BindGroup, fill_cluster_buffers_pass_pipeline: &ComputePipeline, - fill_cluster_buffers_pass_workgroups: u32, - cluster_count: u32, + scene_instance_count: u32, ) { + let mut fill_cluster_buffers_pass_workgroups_x = scene_instance_count; + let mut fill_cluster_buffers_pass_workgroups_y = 1; + if scene_instance_count + > render_context + .render_device() + .limits() + .max_compute_workgroups_per_dimension + { + fill_cluster_buffers_pass_workgroups_x = (scene_instance_count as f32).sqrt().ceil() as u32; + fill_cluster_buffers_pass_workgroups_y = fill_cluster_buffers_pass_workgroups_x; + } + let command_encoder = render_context.command_encoder(); let mut fill_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { label: Some("fill_cluster_buffers"), timestamp_writes: None, }); fill_pass.set_pipeline(fill_cluster_buffers_pass_pipeline); - fill_pass.set_push_constants(0, &cluster_count.to_le_bytes()); + fill_pass.set_push_constants(0, &scene_instance_count.to_le_bytes()); fill_pass.set_bind_group(0, fill_cluster_buffers_bind_group, &[]); fill_pass.dispatch_workgroups( - fill_cluster_buffers_pass_workgroups, - fill_cluster_buffers_pass_workgroups, - fill_cluster_buffers_pass_workgroups, + fill_cluster_buffers_pass_workgroups_x, + fill_cluster_buffers_pass_workgroups_y, + 1, ); } @@ -361,17 +375,26 @@ fn cull_pass( previous_view_offset: &PreviousViewUniformOffset, culling_pipeline: &ComputePipeline, culling_workgroups: u32, + scene_cluster_count: u32, raster_cluster_rightmost_slot: u32, remap_1d_to_2d_dispatch_bind_group: Option<&BindGroup>, remap_1d_to_2d_dispatch_pipeline: Option<&ComputePipeline>, ) { + let max_compute_workgroups_per_dimension = render_context + .render_device() + .limits() + .max_compute_workgroups_per_dimension; + let command_encoder = render_context.command_encoder(); let mut cull_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { label: Some(label), timestamp_writes: None, }); cull_pass.set_pipeline(culling_pipeline); - cull_pass.set_push_constants(0, &raster_cluster_rightmost_slot.to_le_bytes()); + cull_pass.set_push_constants( + 0, + bytemuck::cast_slice(&[scene_cluster_count, raster_cluster_rightmost_slot]), + ); cull_pass.set_bind_group( 0, culling_bind_group, @@ -384,6 +407,7 @@ fn cull_pass( remap_1d_to_2d_dispatch_bind_group, ) { cull_pass.set_pipeline(remap_1d_to_2d_dispatch_pipeline); + cull_pass.set_push_constants(0, &max_compute_workgroups_per_dimension.to_be_bytes()); cull_pass.set_bind_group(0, remap_1d_to_2d_dispatch_bind_group, &[]); cull_pass.dispatch_workgroups(1, 1, 1); } diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl index 4346a3b29c942..33c8df6a0e2c6 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl @@ -172,7 +172,7 @@ fn resolve_vertex_output(frag_coord: vec4) -> VertexOutput { ddy_uv, world_tangent, instance_uniform.flags, - cluster_id, + instance_id ^ meshlet_id, #ifdef PREPASS_FRAGMENT #ifdef MOTION_VECTOR_PREPASS motion_vector, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 63ff1d87b1246..6fae9a60accdc 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -216,7 +216,7 @@ pub fn update_previous_view_data( } } -#[derive(Component)] +#[derive(Component, Default)] pub struct PreviousGlobalTransform(pub Affine3A); #[cfg(not(feature = "meshlet"))]