From 357d4ad2f69a534423a67f2a46a86502f2d66456 Mon Sep 17 00:00:00 2001
From: Patrick Walton
Date: Fri, 3 Jan 2025 10:41:15 -0800
Subject: [PATCH] Implement experimental GPU two-phase occlusion culling for
 the standard 3D mesh pipeline.

*Occlusion culling* allows the GPU to skip the vertex and fragment shading
overhead for objects that can be quickly proved to be invisible because
they're behind other geometry. A depth prepass already eliminates most
fragment shading overhead for occluded objects, but the vertex shading
overhead, as well as the cost of testing and rejecting fragments against the
Z-buffer, is presently unavoidable for standard meshes. We currently perform
occlusion culling only for meshlets. But other meshes, such as skinned
meshes, can benefit from occlusion culling too, to avoid the transform and
skinning overhead for unseen meshes.

This commit adapts the same [*two-phase occlusion culling*] technique that
meshlets use to Bevy's standard 3D mesh pipeline. It takes effect when both
the new `OcclusionCulling` component and the `DepthPrepass` component are
present on the camera (see the sketch below). It has these steps:

1. *Early depth prepass*: We use the hierarchical Z-buffer from the previous
   frame to cull meshes for the initial depth prepass, effectively rendering
   only the meshes that were visible in the last frame.

2. *Early depth downsample*: We downsample the depth buffer to create
   another hierarchical Z-buffer, this time with the current view transform.

3. *Late depth prepass*: We use the new hierarchical Z-buffer to test all
   meshes that weren't rendered in the early depth prepass. Any meshes that
   pass this check are rendered.

4. *Late depth downsample*: Again, we downsample the depth buffer to create
   a hierarchical Z-buffer in preparation for the early depth prepass of the
   next frame. This step is done after all the rendering, in order to
   account for custom phase items that might write to the depth buffer.

Note that this patch has no effect on the per-mesh CPU overhead for occluded
objects, which remains high for a GPU-driven renderer due to the lack of
`cold-specialization` and retained bins. If `cold-specialization` and
retained bins weren't on the horizon, then a more traditional approach like
potentially visible sets (PVS) or low-res CPU rendering would probably be
more efficient for most scenes than the GPU-driven approach that this patch
implements. However, at this point the amount of effort required to
implement a PVS baking tool or a low-res CPU renderer would probably be
greater than landing `cold-specialization` and retained bins, and the
GPU-driven approach is the more modern one anyway. It does mean that the
performance improvements from occlusion culling as implemented in this patch
*today* are likely to be limited, because of the high CPU overhead for
occluded meshes.

Note also that this patch currently doesn't implement occlusion culling for
2D objects or shadow maps. Those can be addressed in a follow-up.
Additionally, the techniques in this patch require compute shaders, which
excludes support for WebGL 2.

This PR is marked experimental because of known precision issues with the
downsampling approach when applied to non-power-of-two framebuffer sizes
(i.e. most of them). These precision issues can, in rare cases, cause
objects to be judged occluded when in fact they aren't. (I've never seen
this in practice, but I know it's possible; it's most likely to happen with
small meshes.)
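The following is not part of the patch itself; it's a minimal sketch of how
a user opts a camera into the feature, assuming the standard `bevy` umbrella
re-exports of the modules this patch touches
(`bevy::core_pipeline::prepass` and
`bevy::render::experimental::occlusion_culling`). The
`examples/3d/occlusion_culling.rs` example added by this patch demonstrates
the same setup with real geometry.

```rust
use bevy::core_pipeline::prepass::DepthPrepass;
use bevy::prelude::*;
use bevy::render::experimental::occlusion_culling::OcclusionCulling;

fn main() {
    App::new()
        .add_plugins(DefaultPlugins)
        .add_systems(Startup, setup)
        .run();
}

// Sketch only: spawn a camera that opts into two-phase occlusion culling.
// Both components are required: `DepthPrepass` enables the depth-only
// prepass, and `OcclusionCulling` splits it into the early/late phases with
// hierarchical Z-buffer downsampling in between.
fn setup(mut commands: Commands) {
    commands.spawn((
        Camera3d::default(),
        Transform::from_xyz(0.0, 2.0, 8.0).looking_at(Vec3::ZERO, Vec3::Y),
        DepthPrepass,
        OcclusionCulling,
    ));
}
```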
As a follow-up to this patch, we intend to switch to the [SPD-based hi-Z
buffer shader from the Granite engine], which doesn't suffer from these
problems; at that point we should be able to graduate this feature from
experimental status. I opted not to include that rewrite in this patch for
two reasons: (1) @JMS55 is planning to do the rewrite to coincide with the
new availability of image atomic operations in Naga; (2) to reduce the scope
of this patch.

[*two-phase occlusion culling*]: https://medium.com/@mil_kru/two-pass-occlusion-culling-4100edcad501
[Aaltonen SIGGRAPH 2015]: https://www.advances.realtimerendering.com/s2015/aaltonenhaar_siggraph2015_combined_final_footer_220dpi.pdf
[Some literature]: https://gist.github.com/reduz/c5769d0e705d8ab7ac187d63be0099b5?permalink_comment_id=5040452#gistcomment-5040452
[SPD-based hi-Z buffer shader from the Granite engine]: https://github.com/Themaister/Granite/blob/master/assets/shaders/post/hiz.comp

--- Cargo.toml | 11 + crates/bevy_core_pipeline/Cargo.toml | 1 + crates/bevy_core_pipeline/src/core_2d/mod.rs | 2 + crates/bevy_core_pipeline/src/core_3d/mod.rs | 36 +- .../mip_generation}/downsample_depth.wgsl | 30 +- .../src/experimental/mip_generation/mod.rs | 742 +++++++ .../src/experimental/mod.rs | 11 + crates/bevy_core_pipeline/src/lib.rs | 18 +- crates/bevy_core_pipeline/src/prepass/mod.rs | 1 + crates/bevy_core_pipeline/src/prepass/node.rs | 306 +-- crates/bevy_pbr/src/lib.rs | 8 +- .../bevy_pbr/src/meshlet/cull_clusters.wgsl | 23 +- crates/bevy_pbr/src/meshlet/mod.rs | 7 - crates/bevy_pbr/src/meshlet/pipelines.rs | 27 +- .../bevy_pbr/src/meshlet/resource_manager.rs | 108 +- .../meshlet/visibility_buffer_raster_node.rs | 61 +- crates/bevy_pbr/src/prepass/mod.rs | 5 +- .../src/prepass/prepass_bindings.wgsl | 3 +- .../src/render/build_indirect_params.wgsl | 28 +- crates/bevy_pbr/src/render/gpu_preprocess.rs | 1919 +++++++++++++---- crates/bevy_pbr/src/render/mesh.rs | 15 +- .../bevy_pbr/src/render/mesh_preprocess.wgsl | 229 +- .../src/render/mesh_preprocess_types.wgsl | 98 - .../src/render/occlusion_culling.wgsl | 30 + .../src/render/reset_indirect_batch_sets.wgsl | 25 + .../src/render/view_transformations.wgsl | 30 + .../src/batching/gpu_preprocessing.rs | 310 ++- crates/bevy_render/src/experimental/mod.rs | 6 + .../mesh_preprocess_types.wgsl | 60 + .../src/experimental/occlusion_culling/mod.rs | 84 + crates/bevy_render/src/lib.rs | 13 +- .../src/render_resource/buffer_vec.rs | 12 + crates/bevy_render/src/view/mod.rs | 3 + crates/bevy_sprite/src/mesh2d/material.rs | 3 +- crates/bevy_sprite/src/mesh2d/mesh.rs | 3 +- examples/3d/occlusion_culling.rs | 587 +++++ examples/README.md | 1 + examples/shader/specialized_mesh_pipeline.rs | 33 +- examples/tools/scene_viewer/main.rs | 41 +- 39 files changed, 4021 insertions(+), 909 deletions(-) rename crates/{bevy_pbr/src/meshlet => bevy_core_pipeline/src/experimental/mip_generation}/downsample_depth.wgsl (92%) create mode 100644 crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs create mode 100644 crates/bevy_core_pipeline/src/experimental/mod.rs delete mode 100644 crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl create mode 100644 crates/bevy_pbr/src/render/occlusion_culling.wgsl create mode 100644 crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl create mode 100644 crates/bevy_render/src/experimental/mod.rs create mode 100644 crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl create mode 100644
crates/bevy_render/src/experimental/occlusion_culling/mod.rs create mode 100644 examples/3d/occlusion_culling.rs diff --git a/Cargo.toml b/Cargo.toml index fe1695c58f9361..5ec30879497403 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4062,3 +4062,14 @@ name = "Directional Navigation" description = "Demonstration of Directional Navigation between UI elements" category = "UI (User Interface)" wasm = true + +[[example]] +name = "occlusion_culling" +path = "examples/3d/occlusion_culling.rs" +doc-scrape-examples = true + +[package.metadata.example.occlusion_culling] +name = "Occlusion Culling" +description = "Demonstration of Occlusion Culling" +category = "3D Rendering" +wasm = false diff --git a/crates/bevy_core_pipeline/Cargo.toml b/crates/bevy_core_pipeline/Cargo.toml index 0042b77180deb4..5b9c9b6538753c 100644 --- a/crates/bevy_core_pipeline/Cargo.toml +++ b/crates/bevy_core_pipeline/Cargo.toml @@ -43,6 +43,7 @@ nonmax = "0.5" smallvec = "1" thiserror = { version = "2", default-features = false } tracing = { version = "0.1", default-features = false, features = ["std"] } +bytemuck = { version = "1" } [lints] workspace = true diff --git a/crates/bevy_core_pipeline/src/core_2d/mod.rs b/crates/bevy_core_pipeline/src/core_2d/mod.rs index ec0fa58d73f608..1c2dc5661874e8 100644 --- a/crates/bevy_core_pipeline/src/core_2d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs @@ -312,6 +312,8 @@ impl PhaseItem for AlphaMask2d { } impl BinnedPhaseItem for AlphaMask2d { + // Since 2D meshes presently can't be multidrawn, the batch set key is + // irrelevant. type BatchSetKey = BatchSetKey2d; type BinKey = AlphaMask2dBinKey; diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs index 393508047a0175..1244ef9368553b 100644 --- a/crates/bevy_core_pipeline/src/core_3d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs @@ -16,7 +16,9 @@ pub mod graph { #[derive(Debug, Hash, PartialEq, Eq, Clone, RenderLabel)] pub enum Node3d { MsaaWriteback, - Prepass, + EarlyPrepass, + EarlyDownsampleDepth, + LatePrepass, DeferredPrepass, CopyDeferredLightingId, EndPrepasses, @@ -25,6 +27,7 @@ pub mod graph { MainTransmissivePass, MainTransparentPass, EndMainPass, + LateDownsampleDepth, Taa, MotionBlur, Bloom, @@ -67,9 +70,10 @@ use core::ops::Range; use bevy_render::{ batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport}, + experimental::occlusion_culling::OcclusionCulling, mesh::allocator::SlabId, render_phase::PhaseItemBatchSetKey, - view::{NoIndirectDrawing, RetainedViewEntity}, + view::{prepare_view_targets, NoIndirectDrawing, RetainedViewEntity}, }; pub use camera_3d::*; pub use main_opaque_pass_3d_node::*; @@ -114,8 +118,9 @@ use crate::{ }, dof::DepthOfFieldNode, prepass::{ - node::PrepassNode, AlphaMask3dPrepass, DeferredPrepass, DepthPrepass, MotionVectorPrepass, - NormalPrepass, Opaque3dPrepass, OpaqueNoLightmap3dBatchSetKey, OpaqueNoLightmap3dBinKey, + node::{EarlyPrepassNode, LatePrepassNode}, + AlphaMask3dPrepass, DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass, + Opaque3dPrepass, OpaqueNoLightmap3dBatchSetKey, OpaqueNoLightmap3dBinKey, ViewPrepassTextures, MOTION_VECTOR_PREPASS_FORMAT, NORMAL_PREPASS_FORMAT, }, skybox::SkyboxPlugin, @@ -161,6 +166,9 @@ impl Plugin for Core3dPlugin { ( sort_phase_system::.in_set(RenderSet::PhaseSort), sort_phase_system::.in_set(RenderSet::PhaseSort), + configure_occlusion_culling_view_targets + .after(prepare_view_targets) + .in_set(RenderSet::ManageViews), 
prepare_core_3d_depth_textures.in_set(RenderSet::PrepareResources), prepare_core_3d_transmission_textures.in_set(RenderSet::PrepareResources), prepare_prepass_textures.in_set(RenderSet::PrepareResources), @@ -169,7 +177,8 @@ impl Plugin for Core3dPlugin { render_app .add_render_sub_graph(Core3d) - .add_render_graph_node::>(Core3d, Node3d::Prepass) + .add_render_graph_node::>(Core3d, Node3d::EarlyPrepass) + .add_render_graph_node::>(Core3d, Node3d::LatePrepass) .add_render_graph_node::>( Core3d, Node3d::DeferredPrepass, @@ -200,7 +209,8 @@ impl Plugin for Core3dPlugin { .add_render_graph_edges( Core3d, ( - Node3d::Prepass, + Node3d::EarlyPrepass, + Node3d::LatePrepass, Node3d::DeferredPrepass, Node3d::CopyDeferredLightingId, Node3d::EndPrepasses, @@ -898,6 +908,20 @@ pub fn prepare_core_3d_transmission_textures( } } +/// Sets the `TEXTURE_BINDING` flag on the depth texture if necessary for +/// occlusion culling. +/// +/// We need that flag to be set in order to read from the texture. +fn configure_occlusion_culling_view_targets( + mut view_targets: Query<&mut Camera3d, (With, With)>, +) { + for mut camera_3d in &mut view_targets { + let mut depth_texture_usages = TextureUsages::from(camera_3d.depth_texture_usages); + depth_texture_usages |= TextureUsages::TEXTURE_BINDING; + camera_3d.depth_texture_usages = depth_texture_usages.into(); + } +} + // Disable MSAA and warn if using deferred rendering pub fn check_msaa(mut deferred_views: Query<&mut Msaa, (With, With)>) { for mut msaa in deferred_views.iter_mut() { diff --git a/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl similarity index 92% rename from crates/bevy_pbr/src/meshlet/downsample_depth.wgsl rename to crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl index 80dd7d4baafd42..d24afa390dbca5 100644 --- a/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl +++ b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl @@ -1,8 +1,16 @@ #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(0) var mip_0: array; // Per pixel #else +#ifdef MESHLET @group(0) @binding(0) var mip_0: array; // Per pixel -#endif +#else // MESHLET +#ifdef MULTISAMPLE +@group(0) @binding(0) var mip_0: texture_depth_multisampled_2d; +#else // MULTISAMPLE +@group(0) @binding(0) var mip_0: texture_depth_2d; +#endif // MULTISAMPLE +#endif // MESHLET +#endif // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(1) var mip_1: texture_storage_2d; @group(0) @binding(2) var mip_2: texture_storage_2d; @group(0) @binding(3) var mip_3: texture_storage_2d; @@ -304,9 +312,25 @@ fn load_mip_0(x: u32, y: u32) -> f32 { let i = y * constants.view_width + x; #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT return bitcast(u32(mip_0[i] >> 32u)); -#else +#else // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT +#ifdef MESHLET return bitcast(mip_0[i]); -#endif +#else // MESHLET + // Downsample the top level. +#ifdef MULTISAMPLE + // The top level is multisampled, so we need to loop over all the samples + // and reduce them to 1. 
+ var result = textureLoad(mip_0, vec2(x, y), 0); + let sample_count = i32(textureNumSamples(mip_0)); + for (var sample = 1; sample < sample_count; sample += 1) { + result = min(result, textureLoad(mip_0, vec2(x, y), sample)); + } + return result; +#else // MULTISAMPLE + return textureLoad(mip_0, vec2(x, y), 0); +#endif // MULTISAMPLE +#endif // MESHLET +#endif // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT } fn reduce_4(v: vec4f) -> f32 { diff --git a/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs b/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs new file mode 100644 index 00000000000000..b0ae46d3779172 --- /dev/null +++ b/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs @@ -0,0 +1,742 @@ +//! Downsampling of textures to produce mipmap levels. +//! +//! Currently, this module only supports generation of hierarchical Z buffers +//! for occlusion culling. It's marked experimental because the shader is +//! designed only for power-of-two texture sizes and is slightly incorrect for +//! non-power-of-two depth buffer sizes. + +use core::array; + +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_derive::{Deref, DerefMut}; +use bevy_ecs::{ + component::Component, + entity::Entity, + query::{QueryItem, With}, + schedule::IntoSystemConfigs as _, + system::{lifetimeless::Read, Commands, Query, Res, ResMut, Resource}, + world::{FromWorld, World}, +}; +use bevy_math::{uvec2, UVec2, Vec4Swizzles as _}; +use bevy_render::{ + experimental::occlusion_culling::OcclusionCulling, + render_graph::{NodeRunError, RenderGraphApp, RenderGraphContext, ViewNode, ViewNodeRunner}, + render_resource::{ + binding_types::{sampler, texture_2d, texture_2d_multisampled, texture_storage_2d}, + BindGroup, BindGroupEntries, BindGroupLayout, BindGroupLayoutEntries, + CachedComputePipelineId, ComputePassDescriptor, ComputePipeline, ComputePipelineDescriptor, + Extent3d, IntoBinding, PipelineCache, PushConstantRange, Sampler, SamplerBindingType, + SamplerDescriptor, Shader, ShaderStages, SpecializedComputePipeline, + SpecializedComputePipelines, StorageTextureAccess, TextureAspect, TextureDescriptor, + TextureDimension, TextureFormat, TextureSampleType, TextureUsages, TextureView, + TextureViewDescriptor, TextureViewDimension, + }, + renderer::{RenderContext, RenderDevice}, + texture::TextureCache, + view::{ExtractedView, ViewDepthTexture}, + Render, RenderApp, RenderSet, +}; +use bitflags::bitflags; + +use crate::{ + core_3d::{ + graph::{Core3d, Node3d}, + prepare_core_3d_depth_textures, + }, + prepass::DepthPrepass, +}; + +/// Identifies the `downsample_depth.wgsl` shader. +pub const DOWNSAMPLE_DEPTH_SHADER_HANDLE: Handle = + Handle::weak_from_u128(3876351454330663524); + +/// The maximum number of mip levels that we can produce. +/// +/// 2^12 is 4096, so that's the maximum size of the depth buffer that we +/// support. +pub const DEPTH_PYRAMID_MIP_COUNT: usize = 12; + +/// A plugin that allows Bevy to repeatedly downsample textures to create +/// mipmaps. +/// +/// Currently, this is only used for hierarchical Z buffer generation for the +/// purposes of occlusion culling. 
+pub struct MipGenerationPlugin; + +impl Plugin for MipGenerationPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + DOWNSAMPLE_DEPTH_SHADER_HANDLE, + "downsample_depth.wgsl", + Shader::from_wgsl + ); + + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app + .init_resource::>() + .add_render_graph_node::>( + Core3d, + Node3d::EarlyDownsampleDepth, + ) + .add_render_graph_node::>( + Core3d, + Node3d::LateDownsampleDepth, + ) + .add_render_graph_edges( + Core3d, + ( + Node3d::EarlyPrepass, + Node3d::EarlyDownsampleDepth, + Node3d::LatePrepass, + Node3d::DeferredPrepass, + ), + ) + .add_render_graph_edges( + Core3d, + ( + Node3d::EndMainPass, + Node3d::LateDownsampleDepth, + Node3d::EndMainPassPostProcessing, + ), + ) + .add_systems( + Render, + ( + prepare_view_depth_pyramids, + prepare_downsample_depth_view_bind_groups, + ) + .chain() + .in_set(RenderSet::PrepareResources) + .after(prepare_core_3d_depth_textures), + ); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + render_app + .init_resource::() + .init_resource::(); + } +} + +/// The first node that produces a hierarchical Z-buffer, also known as a depth +/// pyramid. +/// +/// This runs the single-pass downsampling (SPD) shader with the *min* filter in +/// order to generate a series of mipmaps for the Z buffer. The resulting +/// hierarchical Z buffer can be used for occlusion culling. +/// +/// This is the first hierarchical Z-buffer stage, which runs after the early +/// prepass and before the late prepass. It prepares the Z-buffer for the +/// bounding box tests that the late mesh preprocessing stage will perform. +/// +/// This node won't do anything if occlusion culling isn't on. +#[derive(Default)] +pub struct EarlyDownsampleDepthNode; + +impl ViewNode for EarlyDownsampleDepthNode { + type ViewQuery = ( + Read, + Read, + Read, + ); + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + (view_depth_pyramid, view_downsample_depth_bind_group, view_depth_texture): QueryItem< + 'w, + Self::ViewQuery, + >, + world: &'w World, + ) -> Result<(), NodeRunError> { + downsample_depth( + "early downsample depth", + render_context, + view_depth_pyramid, + view_downsample_depth_bind_group, + view_depth_texture, + world, + ) + } +} + +/// The second node that produces a hierarchical Z-buffer, also known as a depth +/// pyramid. +/// +/// This runs the single-pass downsampling (SPD) shader with the *min* filter in +/// order to generate a series of mipmaps for the Z buffer. The resulting +/// hierarchical Z buffer can be used for occlusion culling. +/// +/// This is the second hierarchical Z-buffer stage, which runs at the end of the +/// main phase. It prepares the Z-buffer for the occlusion culling that the +/// early mesh preprocessing phase of the *next* frame will perform. +/// +/// This node won't do anything if occlusion culling isn't on. 
+#[derive(Default)] +pub struct LateDownsampleDepthNode; + +impl ViewNode for LateDownsampleDepthNode { + type ViewQuery = ( + Read, + Read, + Read, + ); + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + (view_depth_pyramid, view_downsample_depth_bind_group, view_depth_texture): QueryItem< + 'w, + Self::ViewQuery, + >, + world: &'w World, + ) -> Result<(), NodeRunError> { + downsample_depth( + "late downsample depth", + render_context, + view_depth_pyramid, + view_downsample_depth_bind_group, + view_depth_texture, + world, + ) + } +} + +/// Produces a depth pyramid from the current depth buffer for a single view. +/// +/// The resulting depth pyramid can be used for occlusion testing. +fn downsample_depth( + label: &str, + render_context: &mut RenderContext, + view_depth_pyramid: &ViewDepthPyramid, + view_downsample_depth_bind_group: &ViewDownsampleDepthBindGroup, + view_depth_texture: &ViewDepthTexture, + world: &World, +) -> Result<(), NodeRunError> { + let downsample_depth_pipelines = world.resource::(); + let pipeline_cache = world.resource::(); + + // Despite the name "single-pass downsampling", we actually need two + // passes because of the lack of `coherent` buffers in WGPU/WGSL. + // Between each pass, there's an implicit synchronization barrier. + + // Fetch the appropriate pipeline ID, depending on whether the depth + // buffer is multisampled or not. + let (Some(first_downsample_depth_pipeline_id), Some(second_downsample_depth_pipeline_id)) = + (if view_depth_texture.texture.sample_count() > 1 { + ( + downsample_depth_pipelines.first_multisample.pipeline_id, + downsample_depth_pipelines.second_multisample.pipeline_id, + ) + } else { + ( + downsample_depth_pipelines.first.pipeline_id, + downsample_depth_pipelines.second.pipeline_id, + ) + }) + else { + return Ok(()); + }; + + // Fetch the pipelines for the two passes. + let (Some(first_downsample_depth_pipeline), Some(second_downsample_depth_pipeline)) = ( + pipeline_cache.get_compute_pipeline(first_downsample_depth_pipeline_id), + pipeline_cache.get_compute_pipeline(second_downsample_depth_pipeline_id), + ) else { + return Ok(()); + }; + + // Run the depth downsampling. + let view_size = uvec2( + view_depth_texture.texture.width(), + view_depth_texture.texture.height(), + ); + view_depth_pyramid.downsample_depth( + label, + render_context, + view_size, + view_downsample_depth_bind_group, + first_downsample_depth_pipeline, + second_downsample_depth_pipeline, + ); + Ok(()) +} + +/// A single depth downsample pipeline. +#[derive(Resource)] +pub struct DownsampleDepthPipeline { + /// The bind group layout for this pipeline. + bind_group_layout: BindGroupLayout, + /// A handle that identifies the compiled shader. + pipeline_id: Option, +} + +impl DownsampleDepthPipeline { + /// Creates a new [`DownsampleDepthPipeline`] from a bind group layout. + /// + /// This doesn't actually specialize the pipeline; that must be done + /// afterward. + fn new(bind_group_layout: BindGroupLayout) -> DownsampleDepthPipeline { + DownsampleDepthPipeline { + bind_group_layout, + pipeline_id: None, + } + } +} + +/// Stores all depth buffer downsampling pipelines. +#[derive(Resource)] +pub struct DownsampleDepthPipelines { + /// The first pass of the pipeline, when the depth buffer is *not* + /// multisampled. + first: DownsampleDepthPipeline, + /// The second pass of the pipeline, when the depth buffer is *not* + /// multisampled. 
+ second: DownsampleDepthPipeline, + /// The first pass of the pipeline, when the depth buffer is multisampled. + first_multisample: DownsampleDepthPipeline, + /// The second pass of the pipeline, when the depth buffer is multisampled. + second_multisample: DownsampleDepthPipeline, + /// The sampler that the depth downsampling shader uses to sample the depth + /// buffer. + sampler: Sampler, +} + +impl FromWorld for DownsampleDepthPipelines { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + // Create the bind group layouts. The bind group layouts are identical + // between the first and second passes, so the only thing we need to + // treat specially is the type of the first mip level (non-multisampled + // or multisampled). + let standard_bind_group_layout = + create_downsample_depth_bind_group_layout(render_device, false); + let multisampled_bind_group_layout = + create_downsample_depth_bind_group_layout(render_device, true); + + // Create the depth pyramid sampler. This is shared among all shaders. + let sampler = render_device.create_sampler(&SamplerDescriptor { + label: Some("depth pyramid sampler"), + ..SamplerDescriptor::default() + }); + + // Specialize the pipelines. + world.resource_scope::, _>( + |world, mut specialized_compute_pipelines| { + let pipeline_cache = world.resource::(); + + // Initialize the pipelines. + let mut downsample_depth_pipelines = DownsampleDepthPipelines { + first: DownsampleDepthPipeline::new(standard_bind_group_layout.clone()), + second: DownsampleDepthPipeline::new(standard_bind_group_layout.clone()), + first_multisample: DownsampleDepthPipeline::new( + multisampled_bind_group_layout.clone(), + ), + second_multisample: DownsampleDepthPipeline::new( + multisampled_bind_group_layout.clone(), + ), + sampler, + }; + + // Specialize each pipeline with the appropriate + // `DownsampleDepthPipelineKey`. + downsample_depth_pipelines.first.pipeline_id = + Some(specialized_compute_pipelines.specialize( + pipeline_cache, + &downsample_depth_pipelines.first, + DownsampleDepthPipelineKey::empty(), + )); + downsample_depth_pipelines.second.pipeline_id = + Some(specialized_compute_pipelines.specialize( + pipeline_cache, + &downsample_depth_pipelines.second, + DownsampleDepthPipelineKey::SECOND_PHASE, + )); + downsample_depth_pipelines.first_multisample.pipeline_id = + Some(specialized_compute_pipelines.specialize( + pipeline_cache, + &downsample_depth_pipelines.first_multisample, + DownsampleDepthPipelineKey::MULTISAMPLE, + )); + downsample_depth_pipelines.second_multisample.pipeline_id = + Some(specialized_compute_pipelines.specialize( + pipeline_cache, + &downsample_depth_pipelines.second_multisample, + DownsampleDepthPipelineKey::SECOND_PHASE + | DownsampleDepthPipelineKey::MULTISAMPLE, + )); + + downsample_depth_pipelines + }, + ) + } +} + +/// Creates a single bind group layout for the downsample depth pass. +fn create_downsample_depth_bind_group_layout( + render_device: &RenderDevice, + is_multisampled: bool, +) -> BindGroupLayout { + render_device.create_bind_group_layout( + if is_multisampled { + "downsample multisample depth bind group layout" + } else { + "downsample depth bind group layout" + }, + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // We only care about the multisample status of the depth buffer + // for the first mip level. After the first mip level is + // sampled, we drop to a single sample. 
+ if is_multisampled { + texture_2d_multisampled(TextureSampleType::Depth) + } else { + texture_2d(TextureSampleType::Depth) + }, + // All the mip levels follow: + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::ReadWrite), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + sampler(SamplerBindingType::NonFiltering), + ), + ), + ) +} + +bitflags! { + /// Uniquely identifies a configuration of the downsample depth shader. + /// + /// Note that meshlets maintain their downsample depth shaders on their own + /// and don't use this infrastructure; thus there's no flag for meshlets in + /// here, even though the shader has defines for it. + #[derive(Clone, Copy, PartialEq, Eq, Hash)] + pub struct DownsampleDepthPipelineKey: u8 { + /// True if the depth buffer is multisampled. + const MULTISAMPLE = 1; + /// True if this shader is the second phase of the downsample depth + /// process; false if this shader is the first phase. + const SECOND_PHASE = 2; + } +} + +impl SpecializedComputePipeline for DownsampleDepthPipeline { + type Key = DownsampleDepthPipelineKey; + + fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor { + let mut shader_defs = vec![]; + if key.contains(DownsampleDepthPipelineKey::MULTISAMPLE) { + shader_defs.push("MULTISAMPLE".into()); + } + + let label = format!( + "downsample depth{}{} pipeline", + if key.contains(DownsampleDepthPipelineKey::MULTISAMPLE) { + " multisample" + } else { + "" + }, + if key.contains(DownsampleDepthPipelineKey::SECOND_PHASE) { + " second phase" + } else { + " first phase" + } + ) + .into(); + + ComputePipelineDescriptor { + label: Some(label), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..8, + }], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs, + entry_point: if key.contains(DownsampleDepthPipelineKey::SECOND_PHASE) { + "downsample_depth_second".into() + } else { + "downsample_depth_first".into() + }, + zero_initialize_workgroup_memory: false, + } + } +} + +/// Stores a placeholder texture that can be bound to a depth pyramid binding if +/// no depth pyramid is needed. +#[derive(Resource, Deref, DerefMut)] +pub struct DepthPyramidDummyTexture(TextureView); + +impl FromWorld for DepthPyramidDummyTexture { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + DepthPyramidDummyTexture(create_depth_pyramid_dummy_texture( + render_device, + "depth pyramid dummy texture", + "depth pyramid dummy texture view", + )) + } +} + +/// Creates a placeholder texture that can be bound to a depth pyramid binding +/// if no depth pyramid is needed. 
+pub fn create_depth_pyramid_dummy_texture( + render_device: &RenderDevice, + texture_label: &'static str, + texture_view_label: &'static str, +) -> TextureView { + render_device + .create_texture(&TextureDescriptor { + label: Some(texture_label), + size: Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: TextureDimension::D2, + format: TextureFormat::R32Float, + usage: TextureUsages::STORAGE_BINDING, + view_formats: &[], + }) + .create_view(&TextureViewDescriptor { + label: Some(texture_view_label), + format: Some(TextureFormat::R32Float), + dimension: Some(TextureViewDimension::D2), + aspect: TextureAspect::All, + base_mip_level: 0, + mip_level_count: Some(1), + base_array_layer: 0, + array_layer_count: Some(1), + }) +} + +/// Stores a hierarchical Z-buffer for a view, which is a series of mipmaps +/// useful for efficient occlusion culling. +/// +/// This will only be present on a view when occlusion culling is enabled. +#[derive(Component)] +pub struct ViewDepthPyramid { + /// A texture view containing the entire depth texture. + pub all_mips: TextureView, + /// A series of texture views containing one mip level each. + pub mips: [TextureView; DEPTH_PYRAMID_MIP_COUNT], + /// The total number of mipmap levels. + /// + /// This is the base-2 logarithm of the greatest dimension of the depth + /// buffer, rounded up. + pub mip_count: u32, +} + +impl ViewDepthPyramid { + /// Allocates a new depth pyramid for a depth buffer with the given size. + pub fn new( + render_device: &RenderDevice, + texture_cache: &mut TextureCache, + depth_pyramid_dummy_texture: &TextureView, + size: UVec2, + texture_label: &'static str, + texture_view_label: &'static str, + ) -> ViewDepthPyramid { + // Calculate the size of the depth pyramid. + let depth_pyramid_size = Extent3d { + width: size.x.div_ceil(2), + height: size.y.div_ceil(2), + depth_or_array_layers: 1, + }; + + // Calculate the number of mip levels we need. + let depth_pyramid_mip_count = depth_pyramid_size.max_mips(TextureDimension::D2); + + // Create the depth pyramid. + let depth_pyramid = texture_cache.get( + render_device, + TextureDescriptor { + label: Some(texture_label), + size: depth_pyramid_size, + mip_level_count: depth_pyramid_mip_count, + sample_count: 1, + dimension: TextureDimension::D2, + format: TextureFormat::R32Float, + usage: TextureUsages::STORAGE_BINDING | TextureUsages::TEXTURE_BINDING, + view_formats: &[], + }, + ); + + // Create individual views for each level of the depth pyramid. + let depth_pyramid_mips = array::from_fn(|i| { + if (i as u32) < depth_pyramid_mip_count { + depth_pyramid.texture.create_view(&TextureViewDescriptor { + label: Some(texture_view_label), + format: Some(TextureFormat::R32Float), + dimension: Some(TextureViewDimension::D2), + aspect: TextureAspect::All, + base_mip_level: i as u32, + mip_level_count: Some(1), + base_array_layer: 0, + array_layer_count: Some(1), + }) + } else { + (*depth_pyramid_dummy_texture).clone() + } + }); + + // Create the view for the depth pyramid as a whole. + let depth_pyramid_all_mips = depth_pyramid.default_view.clone(); + + Self { + all_mips: depth_pyramid_all_mips, + mips: depth_pyramid_mips, + mip_count: depth_pyramid_mip_count, + } + } + + /// Creates a bind group that allows the depth buffer to be attached to the + /// `downsample_depth.wgsl` shader. 
+ pub fn create_bind_group<'a, R>( + &'a self, + render_device: &RenderDevice, + label: &'static str, + bind_group_layout: &BindGroupLayout, + source_image: R, + sampler: &'a Sampler, + ) -> BindGroup + where + R: IntoBinding<'a>, + { + render_device.create_bind_group( + label, + bind_group_layout, + &BindGroupEntries::sequential(( + source_image, + &self.mips[0], + &self.mips[1], + &self.mips[2], + &self.mips[3], + &self.mips[4], + &self.mips[5], + &self.mips[6], + &self.mips[7], + &self.mips[8], + &self.mips[9], + &self.mips[10], + &self.mips[11], + sampler, + )), + ) + } + + /// Invokes the shaders to generate the hierarchical Z-buffer. + /// + /// This is intended to be invoked as part of a render node. + pub fn downsample_depth( + &self, + label: &str, + render_context: &mut RenderContext, + view_size: UVec2, + downsample_depth_bind_group: &BindGroup, + downsample_depth_first_pipeline: &ComputePipeline, + downsample_depth_second_pipeline: &ComputePipeline, + ) { + let command_encoder = render_context.command_encoder(); + let mut downsample_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(label), + timestamp_writes: None, + }); + downsample_pass.set_pipeline(downsample_depth_first_pipeline); + // Pass the mip count and the texture width as push constants, for + // simplicity. + downsample_pass.set_push_constants(0, bytemuck::cast_slice(&[self.mip_count, view_size.x])); + downsample_pass.set_bind_group(0, downsample_depth_bind_group, &[]); + downsample_pass.dispatch_workgroups(view_size.x.div_ceil(64), view_size.y.div_ceil(64), 1); + + if self.mip_count >= 7 { + downsample_pass.set_pipeline(downsample_depth_second_pipeline); + downsample_pass.dispatch_workgroups(1, 1, 1); + } + } +} + +/// Creates depth pyramids for views that have occlusion culling enabled. +fn prepare_view_depth_pyramids( + mut commands: Commands, + render_device: Res, + mut texture_cache: ResMut, + depth_pyramid_dummy_texture: Res, + views: Query<(Entity, &ExtractedView), (With, With)>, +) { + for (view_entity, view) in &views { + commands.entity(view_entity).insert(ViewDepthPyramid::new( + &render_device, + &mut texture_cache, + &depth_pyramid_dummy_texture, + view.viewport.zw(), + "view depth pyramid texture", + "view depth pyramid texture view", + )); + } +} + +/// The bind group that we use to attach the depth buffer and depth pyramid for +/// a view to the `downsample_depth.wgsl` shader. +/// +/// This will only be present for a view if occlusion culling is enabled. +#[derive(Component, Deref, DerefMut)] +pub struct ViewDownsampleDepthBindGroup(BindGroup); + +/// Creates the [`ViewDownsampleDepthBindGroup`]s for all views with occlusion +/// culling enabled. 
+fn prepare_downsample_depth_view_bind_groups( + mut commands: Commands, + render_device: Res, + downsample_depth_pipelines: Res, + view_depth_textures: Query<(Entity, &ViewDepthPyramid, &ViewDepthTexture)>, +) { + for (view_entity, view_depth_pyramid, view_depth_texture) in &view_depth_textures { + let is_multisampled = view_depth_texture.texture.sample_count() > 1; + commands + .entity(view_entity) + .insert(ViewDownsampleDepthBindGroup( + view_depth_pyramid.create_bind_group( + &render_device, + if is_multisampled { + "downsample multisample depth bind group" + } else { + "downsample depth bind group" + }, + if is_multisampled { + &downsample_depth_pipelines + .first_multisample + .bind_group_layout + } else { + &downsample_depth_pipelines.first.bind_group_layout + }, + view_depth_texture.view(), + &downsample_depth_pipelines.sampler, + ), + )); + } +} diff --git a/crates/bevy_core_pipeline/src/experimental/mod.rs b/crates/bevy_core_pipeline/src/experimental/mod.rs new file mode 100644 index 00000000000000..4f957477ead6e9 --- /dev/null +++ b/crates/bevy_core_pipeline/src/experimental/mod.rs @@ -0,0 +1,11 @@ +//! Experimental rendering features. +//! +//! Experimental features are features with known problems, missing features, +//! compatibility issues, low performance, and/or future breaking changes, but +//! are included nonetheless for testing purposes. + +pub mod mip_generation; + +pub mod taa { + pub use crate::taa::{TemporalAntiAliasNode, TemporalAntiAliasPlugin, TemporalAntiAliasing}; +} diff --git a/crates/bevy_core_pipeline/src/lib.rs b/crates/bevy_core_pipeline/src/lib.rs index 6c2ee5bec489b0..49b9b7a20b2ee5 100644 --- a/crates/bevy_core_pipeline/src/lib.rs +++ b/crates/bevy_core_pipeline/src/lib.rs @@ -14,6 +14,7 @@ pub mod core_2d; pub mod core_3d; pub mod deferred; pub mod dof; +pub mod experimental; pub mod fullscreen_vertex_shader; pub mod fxaa; pub mod motion_blur; @@ -29,17 +30,6 @@ pub mod upscaling; pub use skybox::Skybox; -/// Experimental features that are not yet finished. Please report any issues you encounter! -/// -/// Expect bugs, missing features, compatibility issues, low performance, and/or future breaking changes. -pub mod experimental { - pub mod taa { - pub use crate::taa::{ - TemporalAntiAliasNode, TemporalAntiAliasPlugin, TemporalAntiAliasing, - }; - } -} - /// The core pipeline prelude. /// /// This includes the most common types in this crate, re-exported for your convenience. 
@@ -56,6 +46,7 @@ use crate::{ core_3d::Core3dPlugin, deferred::copy_lighting_id::CopyDeferredLightingIdPlugin, dof::DepthOfFieldPlugin, + experimental::mip_generation::MipGenerationPlugin, fullscreen_vertex_shader::FULLSCREEN_SHADER_HANDLE, fxaa::FxaaPlugin, motion_blur::MotionBlurPlugin, @@ -87,10 +78,8 @@ impl Plugin for CorePipelinePlugin { .register_type::() .register_type::() .register_type::() + .add_plugins((Core2dPlugin, Core3dPlugin, CopyDeferredLightingIdPlugin)) .add_plugins(( - Core2dPlugin, - Core3dPlugin, - CopyDeferredLightingIdPlugin, BlitPlugin, MsaaWritebackPlugin, TonemappingPlugin, @@ -103,6 +92,7 @@ impl Plugin for CorePipelinePlugin { SmaaPlugin, PostProcessingPlugin, OrderIndependentTransparencyPlugin, + MipGenerationPlugin, )); } } diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs index 7fb2dfcea961bf..1e663a79a48b6f 100644 --- a/crates/bevy_core_pipeline/src/prepass/mod.rs +++ b/crates/bevy_core_pipeline/src/prepass/mod.rs @@ -78,6 +78,7 @@ pub struct DeferredPrepass; pub struct PreviousViewData { pub view_from_world: Mat4, pub clip_from_world: Mat4, + pub clip_from_view: Mat4, } #[derive(Resource, Default)] diff --git a/crates/bevy_core_pipeline/src/prepass/node.rs b/crates/bevy_core_pipeline/src/prepass/node.rs index 9019890d7ed86e..2e4931657bd144 100644 --- a/crates/bevy_core_pipeline/src/prepass/node.rs +++ b/crates/bevy_core_pipeline/src/prepass/node.rs @@ -19,13 +19,40 @@ use super::{ ViewPrepassTextures, }; -/// Render node used by the prepass. +/// The phase of the prepass that draws meshes that were visible last frame. /// -/// By default, inserted before the main pass in the render graph. +/// If occlusion culling isn't in use, this prepass simply draws all meshes. +/// +/// Like all prepass nodes, this is inserted before the main pass in the render +/// graph. +#[derive(Default)] +pub struct EarlyPrepassNode; + +impl ViewNode for EarlyPrepassNode { + type ViewQuery = ::ViewQuery; + + fn run<'w>( + &self, + graph: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + view_query: QueryItem<'w, Self::ViewQuery>, + world: &'w World, + ) -> Result<(), NodeRunError> { + run_prepass(graph, render_context, view_query, world, "early prepass") + } +} + +/// The phase of the prepass that runs after occlusion culling against the +/// meshes that were visible last frame. +/// +/// If occlusion culling isn't in use, this is a no-op. +/// +/// Like all prepass nodes, this is inserted before the main pass in the render +/// graph. 
#[derive(Default)] -pub struct PrepassNode; +pub struct LatePrepassNode; -impl ViewNode for PrepassNode { +impl ViewNode for LatePrepassNode { type ViewQuery = ( &'static ExtractedCamera, &'static ExtractedView, @@ -42,150 +69,159 @@ impl ViewNode for PrepassNode { &self, graph: &mut RenderGraphContext, render_context: &mut RenderContext<'w>, - ( - camera, - extracted_view, - view_depth_texture, - view_prepass_textures, - view_uniform_offset, - deferred_prepass, - skybox_prepass_pipeline, - skybox_prepass_bind_group, - view_prev_uniform_offset, - ): QueryItem<'w, Self::ViewQuery>, + query: QueryItem<'w, Self::ViewQuery>, world: &'w World, ) -> Result<(), NodeRunError> { - let (Some(opaque_prepass_phases), Some(alpha_mask_prepass_phases)) = ( - world.get_resource::>(), - world.get_resource::>(), - ) else { - return Ok(()); - }; - - let (Some(opaque_prepass_phase), Some(alpha_mask_prepass_phase)) = ( - opaque_prepass_phases.get(&extracted_view.retained_view_entity), - alpha_mask_prepass_phases.get(&extracted_view.retained_view_entity), - ) else { - return Ok(()); - }; - - let diagnostics = render_context.diagnostic_recorder(); - - let mut color_attachments = vec![ - view_prepass_textures - .normal - .as_ref() - .map(|normals_texture| normals_texture.get_attachment()), - view_prepass_textures - .motion_vectors - .as_ref() - .map(|motion_vectors_texture| motion_vectors_texture.get_attachment()), - // Use None in place of deferred attachments - None, - None, - ]; - - // If all color attachments are none: clear the color attachment list so that no fragment shader is required - if color_attachments.iter().all(Option::is_none) { - color_attachments.clear(); - } + run_prepass(graph, render_context, query, world, "late prepass") + } +} + +/// Runs a prepass that draws all meshes to the depth buffer, and possibly +/// normal and motion vector buffers as well. +/// +/// If occlusion culling isn't in use, and a prepass is enabled, then there's +/// only one prepass. If occlusion culling is in use, then any prepass is split +/// into two: an *early* prepass and a *late* prepass. The early prepass draws +/// what was visible last frame, and the late prepass performs occlusion culling +/// against a conservative hierarchical Z buffer before drawing unoccluded +/// meshes.
+fn run_prepass<'w>( + graph: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + ( + camera, + extracted_view, + view_depth_texture, + view_prepass_textures, + view_uniform_offset, + deferred_prepass, + skybox_prepass_pipeline, + skybox_prepass_bind_group, + view_prev_uniform_offset, + ): QueryItem<'w, ::ViewQuery>, + world: &'w World, + label: &'static str, +) -> Result<(), NodeRunError> { + let (Some(opaque_prepass_phases), Some(alpha_mask_prepass_phases)) = ( + world.get_resource::>(), + world.get_resource::>(), + ) else { + return Ok(()); + }; + + let (Some(opaque_prepass_phase), Some(alpha_mask_prepass_phase)) = ( + opaque_prepass_phases.get(&extracted_view.retained_view_entity), + alpha_mask_prepass_phases.get(&extracted_view.retained_view_entity), + ) else { + return Ok(()); + }; + + let diagnostics = render_context.diagnostic_recorder(); + + let mut color_attachments = vec![ + view_prepass_textures + .normal + .as_ref() + .map(|normals_texture| normals_texture.get_attachment()), + view_prepass_textures + .motion_vectors + .as_ref() + .map(|motion_vectors_texture| motion_vectors_texture.get_attachment()), + // Use None in place of deferred attachments + None, + None, + ]; + + // If all color attachments are none: clear the color attachment list so that no fragment shader is required + if color_attachments.iter().all(Option::is_none) { + color_attachments.clear(); + } + + let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store)); - let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store)); + let view_entity = graph.view_entity(); + render_context.add_command_buffer_generation_task(move |render_device| { + #[cfg(feature = "trace")] + let _prepass_span = info_span!("prepass").entered(); - let view_entity = graph.view_entity(); - render_context.add_command_buffer_generation_task(move |render_device| { + // Command encoder setup + let mut command_encoder = render_device.create_command_encoder(&CommandEncoderDescriptor { + label: Some("prepass_command_encoder"), + }); + + // Render pass setup + let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor { + label: Some(label), + color_attachments: &color_attachments, + depth_stencil_attachment, + timestamp_writes: None, + occlusion_query_set: None, + }); + + let mut render_pass = TrackedRenderPass::new(&render_device, render_pass); + let pass_span = diagnostics.pass_span(&mut render_pass, label); + + if let Some(viewport) = camera.viewport.as_ref() { + render_pass.set_camera_viewport(viewport); + } + + // Opaque draws + if !opaque_prepass_phase.is_empty() { #[cfg(feature = "trace")] - let _prepass_span = info_span!("prepass").entered(); - - // Command encoder setup - let mut command_encoder = - render_device.create_command_encoder(&CommandEncoderDescriptor { - label: Some("prepass_command_encoder"), - }); - - // Render pass setup - let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor { - label: Some("prepass"), - color_attachments: &color_attachments, - depth_stencil_attachment, - timestamp_writes: None, - occlusion_query_set: None, - }); - - let mut render_pass = TrackedRenderPass::new(&render_device, render_pass); - let pass_span = diagnostics.pass_span(&mut render_pass, "prepass"); - - if let Some(viewport) = camera.viewport.as_ref() { - render_pass.set_camera_viewport(viewport); + let _opaque_prepass_span = info_span!("opaque_prepass").entered(); + if let Err(err) = opaque_prepass_phase.render(&mut render_pass, world, 
view_entity) { + error!("Error encountered while rendering the opaque prepass phase {err:?}"); } + } - // Opaque draws - if !opaque_prepass_phase.multidrawable_mesh_keys.is_empty() - || !opaque_prepass_phase.batchable_mesh_keys.is_empty() - || !opaque_prepass_phase.unbatchable_mesh_keys.is_empty() + // Alpha masked draws + if !alpha_mask_prepass_phase.is_empty() { + #[cfg(feature = "trace")] + let _alpha_mask_prepass_span = info_span!("alpha_mask_prepass").entered(); + if let Err(err) = alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity) { - #[cfg(feature = "trace")] - let _opaque_prepass_span = info_span!("opaque_prepass").entered(); - if let Err(err) = opaque_prepass_phase.render(&mut render_pass, world, view_entity) - { - error!("Error encountered while rendering the opaque prepass phase {err:?}"); - } - } - - // Alpha masked draws - if !alpha_mask_prepass_phase.is_empty() { - #[cfg(feature = "trace")] - let _alpha_mask_prepass_span = info_span!("alpha_mask_prepass").entered(); - if let Err(err) = - alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity) - { - error!( - "Error encountered while rendering the alpha mask prepass phase {err:?}" - ); - } + error!("Error encountered while rendering the alpha mask prepass phase {err:?}"); } + } - // Skybox draw using a fullscreen triangle - if let ( - Some(skybox_prepass_pipeline), - Some(skybox_prepass_bind_group), - Some(view_prev_uniform_offset), - ) = ( - skybox_prepass_pipeline, - skybox_prepass_bind_group, - view_prev_uniform_offset, - ) { - let pipeline_cache = world.resource::(); - if let Some(pipeline) = - pipeline_cache.get_render_pipeline(skybox_prepass_pipeline.0) - { - render_pass.set_render_pipeline(pipeline); - render_pass.set_bind_group( - 0, - &skybox_prepass_bind_group.0, - &[view_uniform_offset.offset, view_prev_uniform_offset.offset], - ); - render_pass.draw(0..3, 0..1); - } + // Skybox draw using a fullscreen triangle + if let ( + Some(skybox_prepass_pipeline), + Some(skybox_prepass_bind_group), + Some(view_prev_uniform_offset), + ) = ( + skybox_prepass_pipeline, + skybox_prepass_bind_group, + view_prev_uniform_offset, + ) { + let pipeline_cache = world.resource::(); + if let Some(pipeline) = pipeline_cache.get_render_pipeline(skybox_prepass_pipeline.0) { + render_pass.set_render_pipeline(pipeline); + render_pass.set_bind_group( + 0, + &skybox_prepass_bind_group.0, + &[view_uniform_offset.offset, view_prev_uniform_offset.offset], + ); + render_pass.draw(0..3, 0..1); } + } - pass_span.end(&mut render_pass); - drop(render_pass); - - // After rendering to the view depth texture, copy it to the prepass depth texture if deferred isn't going to - if deferred_prepass.is_none() { - if let Some(prepass_depth_texture) = &view_prepass_textures.depth { - command_encoder.copy_texture_to_texture( - view_depth_texture.texture.as_image_copy(), - prepass_depth_texture.texture.texture.as_image_copy(), - view_prepass_textures.size, - ); - } + pass_span.end(&mut render_pass); + drop(render_pass); + + // After rendering to the view depth texture, copy it to the prepass depth texture if deferred isn't going to + if deferred_prepass.is_none() { + if let Some(prepass_depth_texture) = &view_prepass_textures.depth { + command_encoder.copy_texture_to_texture( + view_depth_texture.texture.as_image_copy(), + prepass_depth_texture.texture.texture.as_image_copy(), + view_prepass_textures.size, + ); } + } - command_encoder.finish() - }); + command_encoder.finish() + }); - Ok(()) - } + Ok(()) } diff --git 
a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 5377dfba3be17b..044b12c78128c5 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -95,11 +95,13 @@ pub mod graph { /// Label for the volumetric lighting pass. VolumetricFog, /// Label for the compute shader instance data building pass. - GpuPreprocess, + EarlyGpuPreprocess, + LateGpuPreprocess, /// Label for the screen space reflections pass. ScreenSpaceReflections, - /// Label for the indirect parameters building pass. - BuildIndirectParameters, + EarlyPrepassBuildIndirectParameters, + LatePrepassBuildIndirectParameters, + MainBuildIndirectParameters, } } diff --git a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl index 47f6dbb04b6beb..79d93429837ae7 100644 --- a/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl +++ b/crates/bevy_pbr/src/meshlet/cull_clusters.wgsl @@ -16,6 +16,7 @@ constants, MeshletBoundingSphere, } +#import bevy_pbr::occlusion_culling #import bevy_render::maths::affine3_to_square /// Culls individual clusters (1 per thread) in two passes (two pass occlusion culling), and outputs a bitmask of which clusters survived. @@ -82,19 +83,15 @@ fn cull_clusters( let occlusion_culling_bounding_sphere_center_view_space = (view.view_from_world * vec4(occlusion_culling_bounding_sphere_center.xyz, 1.0)).xyz; #endif - var aabb = project_view_space_sphere_to_screen_space_aabb(occlusion_culling_bounding_sphere_center_view_space, occlusion_culling_bounding_sphere_radius); - let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); - var aabb_width_pixels = (aabb.z - aabb.x) * depth_pyramid_size_mip_0.x; - var aabb_height_pixels = (aabb.w - aabb.y) * depth_pyramid_size_mip_0.y; - let depth_level = max(0, i32(ceil(log2(max(aabb_width_pixels, aabb_height_pixels))))); // TODO: Naga doesn't like this being a u32 - let depth_pyramid_size = vec2(textureDimensions(depth_pyramid, depth_level)); - let aabb_top_left = vec2(aabb.xy * depth_pyramid_size); - - let depth_quad_a = textureLoad(depth_pyramid, aabb_top_left, depth_level).x; - let depth_quad_b = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 0u), depth_level).x; - let depth_quad_c = textureLoad(depth_pyramid, aabb_top_left + vec2(0u, 1u), depth_level).x; - let depth_quad_d = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 1u), depth_level).x; - let occluder_depth = min(min(depth_quad_a, depth_quad_b), min(depth_quad_c, depth_quad_d)); + var aabb = project_view_space_sphere_to_screen_space_aabb( + occlusion_culling_bounding_sphere_center_view_space, + occlusion_culling_bounding_sphere_radius + ); + let aabb_pixel_size = occlusion_culling::get_aabb_size_in_pixels(aabb, depth_pyramid); + var aabb_width_pixels = aabb_pixel_size.x; + var aabb_height_pixels = aabb_pixel_size.y; + let occluder_depth = + occlusion_culling::get_occluder_depth(aabb, aabb_pixel_size, depth_pyramid); // Check whether or not the cluster would be occluded if drawn var cluster_visible: bool; diff --git a/crates/bevy_pbr/src/meshlet/mod.rs b/crates/bevy_pbr/src/meshlet/mod.rs index 5db0644f97972f..862db413f88f31 100644 --- a/crates/bevy_pbr/src/meshlet/mod.rs +++ b/crates/bevy_pbr/src/meshlet/mod.rs @@ -175,12 +175,6 @@ impl Plugin for MeshletPlugin { "cull_clusters.wgsl", Shader::from_wgsl ); - load_internal_asset!( - app, - MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - "downsample_depth.wgsl", - Shader::from_wgsl - ); load_internal_asset!( app, MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE, @@ -255,7 +249,6 
@@ impl Plugin for MeshletPlugin { NodePbr::ShadowPass, // NodeMeshlet::Prepass, - Node3d::Prepass, // NodeMeshlet::DeferredPrepass, Node3d::DeferredPrepass, diff --git a/crates/bevy_pbr/src/meshlet/pipelines.rs b/crates/bevy_pbr/src/meshlet/pipelines.rs index 97f1203d2d3b25..57e9a2587918f6 100644 --- a/crates/bevy_pbr/src/meshlet/pipelines.rs +++ b/crates/bevy_pbr/src/meshlet/pipelines.rs @@ -1,7 +1,8 @@ use super::resource_manager::ResourceManager; use bevy_asset::Handle; use bevy_core_pipeline::{ - core_3d::CORE_3D_DEPTH_FORMAT, fullscreen_vertex_shader::fullscreen_shader_vertex_state, + core_3d::CORE_3D_DEPTH_FORMAT, experimental::mip_generation::DOWNSAMPLE_DEPTH_SHADER_HANDLE, + fullscreen_vertex_shader::fullscreen_shader_vertex_state, }; use bevy_ecs::{ system::Resource, @@ -12,8 +13,6 @@ use bevy_render::render_resource::*; pub const MESHLET_FILL_CLUSTER_BUFFERS_SHADER_HANDLE: Handle = Handle::weak_from_u128(4325134235233421); pub const MESHLET_CULLING_SHADER_HANDLE: Handle = Handle::weak_from_u128(5325134235233421); -pub const MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE: Handle = - Handle::weak_from_u128(6325134235233421); pub const MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE: Handle = Handle::weak_from_u128(7325134235233421); pub const MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE: Handle = @@ -119,8 +118,11 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec![ + "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), + "MESHLET".into(), + ], entry_point: "downsample_depth_first".into(), zero_initialize_workgroup_memory: false, }, @@ -134,8 +136,11 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec![ + "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), + "MESHLET".into(), + ], entry_point: "downsample_depth_second".into(), zero_initialize_workgroup_memory: false, }, @@ -149,8 +154,8 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec![], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec!["MESHLET".into()], entry_point: "downsample_depth_first".into(), zero_initialize_workgroup_memory: false, }, @@ -164,8 +169,8 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec![], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec!["MESHLET".into()], entry_point: "downsample_depth_second".into(), zero_initialize_workgroup_memory: false, }, diff --git a/crates/bevy_pbr/src/meshlet/resource_manager.rs b/crates/bevy_pbr/src/meshlet/resource_manager.rs index c918990869cec6..54fad19015fbf8 100644 --- a/crates/bevy_pbr/src/meshlet/resource_manager.rs +++ b/crates/bevy_pbr/src/meshlet/resource_manager.rs @@ -3,6 +3,7 @@ use crate::ShadowView; use alloc::sync::Arc; use bevy_core_pipeline::{ core_3d::Camera3d, + experimental::mip_generation::{self, ViewDepthPyramid}, prepass::{PreviousViewData, PreviousViewUniforms}, }; use bevy_ecs::{ @@ -19,7 +20,7 @@ use bevy_render::{ view::{ExtractedView, RenderLayers, 
ViewUniform, ViewUniforms}, }; use binding_types::*; -use core::{array, iter, sync::atomic::AtomicBool}; +use core::{iter, sync::atomic::AtomicBool}; use encase::internal::WriteInto; /// Manages per-view and per-cluster GPU resources for [`super::MeshletPlugin`]. @@ -84,31 +85,11 @@ impl ResourceManager { label: Some("meshlet_depth_pyramid_sampler"), ..SamplerDescriptor::default() }), - depth_pyramid_dummy_texture: render_device - .create_texture(&TextureDescriptor { - label: Some("meshlet_depth_pyramid_dummy_texture"), - size: Extent3d { - width: 1, - height: 1, - depth_or_array_layers: 1, - }, - mip_level_count: 1, - sample_count: 1, - dimension: TextureDimension::D2, - format: TextureFormat::R32Float, - usage: TextureUsages::STORAGE_BINDING, - view_formats: &[], - }) - .create_view(&TextureViewDescriptor { - label: Some("meshlet_depth_pyramid_dummy_texture_view"), - format: Some(TextureFormat::R32Float), - dimension: Some(TextureViewDimension::D2), - aspect: TextureAspect::All, - base_mip_level: 0, - mip_level_count: Some(1), - base_array_layer: 0, - array_layer_count: Some(1), - }), + depth_pyramid_dummy_texture: mip_generation::create_depth_pyramid_dummy_texture( + render_device, + "meshlet_depth_pyramid_dummy_texture", + "meshlet_depth_pyramid_dummy_texture_view", + ), previous_depth_pyramids: EntityHashMap::default(), @@ -257,9 +238,7 @@ pub struct MeshletViewResources { pub visibility_buffer_software_raster_indirect_args_second: Buffer, pub visibility_buffer_hardware_raster_indirect_args_first: Buffer, pub visibility_buffer_hardware_raster_indirect_args_second: Buffer, - depth_pyramid_all_mips: TextureView, - depth_pyramid_mips: [TextureView; 12], - pub depth_pyramid_mip_count: u32, + pub depth_pyramid: ViewDepthPyramid, previous_depth_pyramid: TextureView, pub material_depth: Option, pub view_size: UVec2, @@ -490,51 +469,23 @@ pub fn prepare_meshlet_per_frame_resources( usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, }); - let depth_pyramid_size = Extent3d { - width: view.viewport.z.div_ceil(2), - height: view.viewport.w.div_ceil(2), - depth_or_array_layers: 1, - }; - let depth_pyramid_mip_count = depth_pyramid_size.max_mips(TextureDimension::D2); - let depth_pyramid = texture_cache.get( + let depth_pyramid = ViewDepthPyramid::new( &render_device, - TextureDescriptor { - label: Some("meshlet_depth_pyramid"), - size: depth_pyramid_size, - mip_level_count: depth_pyramid_mip_count, - sample_count: 1, - dimension: TextureDimension::D2, - format: TextureFormat::R32Float, - usage: TextureUsages::STORAGE_BINDING | TextureUsages::TEXTURE_BINDING, - view_formats: &[], - }, + &mut texture_cache, + &resource_manager.depth_pyramid_dummy_texture, + view.viewport.zw(), + "meshlet_depth_pyramid", + "meshlet_depth_pyramid_texture_view", ); - let depth_pyramid_mips = array::from_fn(|i| { - if (i as u32) < depth_pyramid_mip_count { - depth_pyramid.texture.create_view(&TextureViewDescriptor { - label: Some("meshlet_depth_pyramid_texture_view"), - format: Some(TextureFormat::R32Float), - dimension: Some(TextureViewDimension::D2), - aspect: TextureAspect::All, - base_mip_level: i as u32, - mip_level_count: Some(1), - base_array_layer: 0, - array_layer_count: Some(1), - }) - } else { - resource_manager.depth_pyramid_dummy_texture.clone() - } - }); - let depth_pyramid_all_mips = depth_pyramid.default_view.clone(); let previous_depth_pyramid = match resource_manager.previous_depth_pyramids.get(&view_entity) { Some(texture_view) => texture_view.clone(), - None => depth_pyramid_all_mips.clone(), + 
None => depth_pyramid.all_mips.clone(), }; resource_manager .previous_depth_pyramids - .insert(view_entity, depth_pyramid_all_mips.clone()); + .insert(view_entity, depth_pyramid.all_mips.clone()); let material_depth = TextureDescriptor { label: Some("meshlet_material_depth"), @@ -562,9 +513,7 @@ pub fn prepare_meshlet_per_frame_resources( visibility_buffer_software_raster_indirect_args_second, visibility_buffer_hardware_raster_indirect_args_first, visibility_buffer_hardware_raster_indirect_args_second, - depth_pyramid_all_mips, - depth_pyramid_mips, - depth_pyramid_mip_count, + depth_pyramid, previous_depth_pyramid, material_depth: not_shadow_view .then(|| texture_cache.get(&render_device, material_depth)), @@ -675,7 +624,7 @@ pub fn prepare_meshlet_view_bind_groups( resource_manager .visibility_buffer_raster_clusters .as_entire_binding(), - &view_resources.depth_pyramid_all_mips, + &view_resources.depth_pyramid.all_mips, view_uniforms.clone(), previous_view_uniforms.clone(), )); @@ -685,25 +634,12 @@ pub fn prepare_meshlet_view_bind_groups( &entries, ); - let downsample_depth = render_device.create_bind_group( + let downsample_depth = view_resources.depth_pyramid.create_bind_group( + &render_device, "meshlet_downsample_depth_bind_group", &resource_manager.downsample_depth_bind_group_layout, - &BindGroupEntries::sequential(( - view_resources.visibility_buffer.as_entire_binding(), - &view_resources.depth_pyramid_mips[0], - &view_resources.depth_pyramid_mips[1], - &view_resources.depth_pyramid_mips[2], - &view_resources.depth_pyramid_mips[3], - &view_resources.depth_pyramid_mips[4], - &view_resources.depth_pyramid_mips[5], - &view_resources.depth_pyramid_mips[6], - &view_resources.depth_pyramid_mips[7], - &view_resources.depth_pyramid_mips[8], - &view_resources.depth_pyramid_mips[9], - &view_resources.depth_pyramid_mips[10], - &view_resources.depth_pyramid_mips[11], - &resource_manager.depth_pyramid_sampler, - )), + view_resources.visibility_buffer.as_entire_binding(), + &resource_manager.depth_pyramid_sampler, ); let entries = BindGroupEntries::sequential(( diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs index 9890deb4dbfc1c..09622dee4bdded 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs @@ -149,10 +149,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { Some(camera), meshlet_view_resources.raster_cluster_rightmost_slot, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "early downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_pipeline, downsample_depth_second_pipeline, ); @@ -200,10 +201,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { resolve_material_depth_pipeline, camera, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "late downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_pipeline, downsample_depth_second_pipeline, ); @@ -267,10 +269,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { None, meshlet_view_resources.raster_cluster_rightmost_slot, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "late 
downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_shadow_view_pipeline, downsample_depth_second_shadow_view_pipeline, ); @@ -311,10 +314,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { resolve_depth_shadow_view_pipeline, camera, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "main downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_shadow_view_pipeline, downsample_depth_second_shadow_view_pipeline, ); @@ -471,39 +475,6 @@ fn raster_pass( hardware_pass.draw_indirect(visibility_buffer_hardware_raster_indirect_args, 0); } -fn downsample_depth( - render_context: &mut RenderContext, - meshlet_view_resources: &MeshletViewResources, - meshlet_view_bind_groups: &MeshletViewBindGroups, - downsample_depth_first_pipeline: &ComputePipeline, - downsample_depth_second_pipeline: &ComputePipeline, -) { - let command_encoder = render_context.command_encoder(); - let mut downsample_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some("downsample_depth"), - timestamp_writes: None, - }); - downsample_pass.set_pipeline(downsample_depth_first_pipeline); - downsample_pass.set_push_constants( - 0, - bytemuck::cast_slice(&[ - meshlet_view_resources.depth_pyramid_mip_count, - meshlet_view_resources.view_size.x, - ]), - ); - downsample_pass.set_bind_group(0, &meshlet_view_bind_groups.downsample_depth, &[]); - downsample_pass.dispatch_workgroups( - meshlet_view_resources.view_size.x.div_ceil(64), - meshlet_view_resources.view_size.y.div_ceil(64), - 1, - ); - - if meshlet_view_resources.depth_pyramid_mip_count >= 7 { - downsample_pass.set_pipeline(downsample_depth_second_pipeline); - downsample_pass.dispatch_workgroups(1, 1, 1); - } -} - fn resolve_depth( render_context: &mut RenderContext, depth_stencil_attachment: RenderPassDepthStencilAttachment, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 373119f25ffdeb..bb18d721868f9b 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -202,9 +202,6 @@ where #[derive(Resource)] struct AnyPrepassPluginLoaded; -#[cfg(not(feature = "meshlet"))] -type PreviousViewFilter = (With, With); -#[cfg(feature = "meshlet")] type PreviousViewFilter = Or<(With, With)>; pub fn update_previous_view_data( @@ -216,6 +213,7 @@ pub fn update_previous_view_data( commands.entity(entity).try_insert(PreviousViewData { view_from_world, clip_from_world: camera.clip_from_view() * view_from_world, + clip_from_view: camera.clip_from_view(), }); } } @@ -706,6 +704,7 @@ pub fn prepare_previous_view_uniforms( PreviousViewData { view_from_world, clip_from_world: camera.clip_from_view * view_from_world, + clip_from_view: camera.clip_from_view, } } }; diff --git a/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl b/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl index a8dae83b8e57a2..3bd27b2e037c19 100644 --- a/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl +++ b/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl @@ -3,10 +3,9 @@ struct PreviousViewUniforms { view_from_world: mat4x4, clip_from_world: mat4x4, + clip_from_view: mat4x4, } -#ifdef MOTION_VECTOR_PREPASS @group(0) @binding(2) var previous_view_uniforms: PreviousViewUniforms; -#endif // MOTION_VECTOR_PREPASS // 
Material bindings will be in @group(2) diff --git a/crates/bevy_pbr/src/render/build_indirect_params.wgsl b/crates/bevy_pbr/src/render/build_indirect_params.wgsl index 90741e9064971b..fb2c02d122fd77 100644 --- a/crates/bevy_pbr/src/render/build_indirect_params.wgsl +++ b/crates/bevy_pbr/src/render/build_indirect_params.wgsl @@ -59,11 +59,28 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { let mesh_index = indirect_parameters_metadata[instance_index].mesh_index; let base_output_index = indirect_parameters_metadata[instance_index].base_output_index; let batch_set_index = indirect_parameters_metadata[instance_index].batch_set_index; - let instance_count = atomicLoad(&indirect_parameters_metadata[instance_index].instance_count); // If we aren't using `multi_draw_indirect_count`, we have a 1:1 fixed // assignment of batches to slots in the indirect parameters buffer, so we // can just use the instance index as the index of our indirect parameters. + let early_instance_count = + atomicLoad(&indirect_parameters_metadata[instance_index].early_instance_count); + let late_instance_count = + atomicLoad(&indirect_parameters_metadata[instance_index].late_instance_count); + + // If in the early phase, we draw only the early meshes. If in the late + // phase, we draw only the late meshes. If in the main phase, draw all the + // meshes. +#ifdef EARLY_PHASE + let instance_count = early_instance_count; +#else // EARLY_PHASE +#ifdef LATE_PHASE + let instance_count = late_instance_count; +#else // LATE_PHASE + let instance_count = early_instance_count + late_instance_count; +#endif // LATE_PHASE +#endif // EARLY_PHASE + var indirect_parameters_index = instance_index; // If the current hardware and driver support `multi_draw_indirect_count`, @@ -90,7 +107,16 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // non-indexed meshes are slightly different. indirect_parameters[indirect_parameters_index].instance_count = instance_count; + +#ifdef LATE_PHASE + // The late mesh instances are stored after the early mesh instances, so we + // offset the output index by the number of early mesh instances. 
+ indirect_parameters[indirect_parameters_index].first_instance = + base_output_index + early_instance_count; +#else // LATE_PHASE indirect_parameters[indirect_parameters_index].first_instance = base_output_index; +#endif // LATE_PHASE + indirect_parameters[indirect_parameters_index].base_vertex = current_input[mesh_index].first_vertex_index; diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index 8ac5a7c96e29cb..f6120f776d5233 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -10,29 +10,37 @@ use core::num::{NonZero, NonZeroU64}; use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; -use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_core_pipeline::{ + core_3d::graph::{Core3d, Node3d}, + experimental::mip_generation::ViewDepthPyramid, + prepass::{DepthPrepass, PreviousViewData, PreviousViewUniformOffset, PreviousViewUniforms}, +}; use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ component::Component, entity::Entity, - query::{Has, QueryState, Without}, - schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, - system::{lifetimeless::Read, Commands, Res, ResMut, Resource}, + prelude::resource_exists, + query::{Has, QueryState, With, Without}, + schedule::IntoSystemConfigs as _, + system::{lifetimeless::Read, Commands, Query, Res, ResMut, Resource}, world::{FromWorld, World}, }; use bevy_render::{ batching::gpu_preprocessing::{ - BatchedInstanceBuffers, GpuPreprocessingSupport, IndirectBatchSet, - IndirectParametersBuffers, IndirectParametersIndexed, IndirectParametersMetadata, - IndirectParametersNonIndexed, PreprocessWorkItem, PreprocessWorkItemBuffers, + BatchedInstanceBuffers, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingSupport, + IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed, + IndirectParametersMetadata, IndirectParametersNonIndexed, + LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers, }, + experimental::occlusion_culling::OcclusionCulling, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ - binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer}, + binding_types::{storage_buffer, storage_buffer_read_only, texture_2d, uniform_buffer}, BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, Buffer, BufferBinding, - CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, - DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, - SpecializedComputePipeline, SpecializedComputePipelines, + BufferVec, CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, + DynamicBindGroupLayoutEntries, PipelineCache, RawBufferVec, Shader, ShaderStages, + ShaderType, SpecializedComputePipeline, SpecializedComputePipelines, TextureSampleType, + UninitBufferVec, }, renderer::{RenderContext, RenderDevice, RenderQueue}, settings::WgpuFeatures, @@ -56,6 +64,9 @@ pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = /// The handle to the `mesh_preprocess_types.wgsl` compute shader. pub const MESH_PREPROCESS_TYPES_SHADER_HANDLE: Handle = Handle::weak_from_u128(2720440370122465935); +/// The handle to the `reset_indirect_batch_sets.wgsl` compute shader. +pub const RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE: Handle = + Handle::weak_from_u128(2602194133710559644); /// The handle to the `build_indirect_params.wgsl` compute shader. 
pub const BUILD_INDIRECT_PARAMS_SHADER_HANDLE: Handle = Handle::weak_from_u128(3711077208359699672); @@ -81,26 +92,54 @@ pub struct GpuMeshPreprocessPlugin { /// done by the CPU), transforms them, and, if indirect drawing is on, populates /// indirect draw parameter metadata for the subsequent /// [`BuildIndirectParametersNode`]. -pub struct GpuPreprocessNode { +/// The render node for the mesh uniform building pass. +pub struct EarlyGpuPreprocessNode { view_query: QueryState< ( Entity, - Read, - Read, + Option>, + Option>, Has, + Has, ), Without, >, main_view_query: QueryState>, } +pub struct LateGpuPreprocessNode { + view_query: QueryState< + (Entity, Read, Read), + ( + Without, + Without, + With, + With, + ), + >, +} + /// The render node for the indirect parameter building pass. /// /// This node runs a compute shader on the output of the [`GpuPreprocessNode`] /// in order to transform the [`IndirectParametersMetadata`] into /// properly-formatted [`IndirectParametersIndexed`] and /// [`IndirectParametersNonIndexed`]. -pub struct BuildIndirectParametersNode { +pub struct EarlyPrepassBuildIndirectParametersNode { + view_query: QueryState< + Read, + (Without, Without), + >, +} + +pub struct LatePrepassBuildIndirectParametersNode { + view_query: QueryState< + Read, + (Without, Without), + >, +} + +pub struct MainBuildIndirectParametersNode { view_query: QueryState< Read, (Without, Without), @@ -114,19 +153,31 @@ pub struct PreprocessPipelines { /// The pipeline used for CPU culling. This pipeline doesn't populate /// indirect parameter metadata. pub direct_preprocess: PreprocessPipeline, + pub gpu_frustum_culling_preprocess: PreprocessPipeline, /// The pipeline used for GPU culling. This pipeline populates indirect /// parameter metadata. - pub gpu_culling_preprocess: PreprocessPipeline, + pub early_gpu_occlusion_culling_preprocess: PreprocessPipeline, + pub late_gpu_occlusion_culling_preprocess: PreprocessPipeline, + pub gpu_frustum_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline, + pub gpu_frustum_culling_build_non_indexed_indirect_params: BuildIndirectParametersPipeline, + pub early_phase: PreprocessPhasePipelines, + pub late_phase: PreprocessPhasePipelines, + pub main_phase: PreprocessPhasePipelines, +} + +#[derive(Clone)] +pub struct PreprocessPhasePipelines { + pub reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline, /// The pipeline used for indexed indirect parameter building. /// /// This pipeline converts indirect parameter metadata into indexed indirect /// parameters. - pub build_indexed_indirect_params: BuildIndirectParametersPipeline, + pub gpu_occlusion_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline, /// The pipeline used for non-indexed indirect parameter building. /// /// This pipeline converts indirect parameter metadata into non-indexed /// indirect parameters. - pub build_non_indexed_indirect_params: BuildIndirectParametersPipeline, + pub gpu_occlusion_culling_build_non_indexed_indirect_params: BuildIndirectParametersPipeline, } /// The pipeline for the GPU mesh preprocessing shader. @@ -139,7 +190,22 @@ pub struct PreprocessPipeline { pub pipeline_id: Option, } +/// The pipeline for the batch set count reset shader. +/// +/// This shader resets the indirect batch set count to 0 for each view. It runs +/// in between every phase (early, late, and main). +#[derive(Clone)] +pub struct ResetIndirectBatchSetsPipeline { + /// The bind group layout for the compute shader. 
+    pub bind_group_layout: BindGroupLayout,
+    /// The pipeline ID for the compute shader.
+    ///
+    /// This gets filled in `prepare_preprocess_pipelines`.
+    pub pipeline_id: Option<CachedComputePipelineId>,
+}
+
 /// The pipeline for the indirect parameter building shader.
+#[derive(Clone)]
 pub struct BuildIndirectParametersPipeline {
     /// The bind group layout for the compute shader.
     pub bind_group_layout: BindGroupLayout,
@@ -153,10 +219,18 @@ bitflags! {
     /// Specifies variants of the mesh preprocessing shader.
     #[derive(Clone, Copy, PartialEq, Eq, Hash)]
     pub struct PreprocessPipelineKey: u8 {
-        /// Whether GPU culling is in use.
+        /// Whether GPU frustum culling is in use.
+        ///
+        /// This `#define`'s `FRUSTUM_CULLING` in the shader.
+        const FRUSTUM_CULLING = 1;
+        /// Whether GPU two-phase occlusion culling is in use.
         ///
-        /// This `#define`'s `GPU_CULLING` in the shader.
-        const GPU_CULLING = 1;
+        /// This `#define`'s `OCCLUSION_CULLING` in the shader.
+        const OCCLUSION_CULLING = 2;
+        /// Whether this is the early phase of GPU two-phase occlusion culling.
+        ///
+        /// This `#define`'s `EARLY_PHASE` in the shader.
+        const EARLY_PHASE = 4;
     }

     /// Specifies variants of the indirect parameter building shader.
@@ -171,6 +245,24 @@ bitflags! {
         ///
         /// This defines `MULTI_DRAW_INDIRECT_COUNT_SUPPORTED` in the shader.
         const MULTI_DRAW_INDIRECT_COUNT_SUPPORTED = 2;
+        /// Whether GPU two-phase occlusion culling is in use.
+        ///
+        /// This `#define`'s `OCCLUSION_CULLING` in the shader.
+        const OCCLUSION_CULLING = 4;
+        /// Whether this is the early phase of GPU two-phase occlusion culling.
+        ///
+        /// This `#define`'s `EARLY_PHASE` in the shader.
+        const EARLY_PHASE = 8;
+        /// Whether this is the late phase of GPU two-phase occlusion culling.
+        ///
+        /// This `#define`'s `LATE_PHASE` in the shader.
+        const LATE_PHASE = 16;
+        /// Whether this is the phase that runs after the early and late phases,
+        /// and right before the main drawing logic, when GPU two-phase
+        /// occlusion culling is in use.
+        ///
+        /// This `#define`'s `MAIN_PHASE` in the shader.
+        const MAIN_PHASE = 32;
     }
 }

@@ -195,35 +287,56 @@ pub enum PhasePreprocessBindGroups {
     Direct(BindGroup),

     /// The bind groups used for the compute shader when indirect drawing is
-    /// being used.
+    /// being used, but occlusion culling isn't being used.
     ///
     /// Because indirect drawing requires splitting the meshes into indexed and
     /// non-indexed meshes, there are two bind groups here.
-    Indirect {
-        /// The bind group used for indexed meshes.
-        ///
-        /// This will be `None` if there are no indexed meshes.
+    IndirectFrustumCulling {
+        /// The bind group for indexed meshes.
         indexed: Option<BindGroup>,
-        /// The bind group used for non-indexed meshes.
-        ///
-        /// This will be `None` if there are no non-indexed meshes.
+        /// The bind group for non-indexed meshes.
         non_indexed: Option<BindGroup>,
     },
+
+    /// The bind groups used for the compute shader when indirect drawing is
+    /// being used, and occlusion culling is also being used.
+    ///
+    /// Because indirect drawing requires splitting the meshes into indexed and
+    /// non-indexed meshes, and because occlusion culling requires splitting
+    /// this phase into early and late versions, there are four bind groups
+    /// here.
+    IndirectOcclusionCulling {
+        /// The bind group for indexed meshes during the early mesh
+        /// preprocessing phase.
+        early_indexed: Option<BindGroup>,
+        /// The bind group for non-indexed meshes during the early mesh
+        /// preprocessing phase.
+        early_non_indexed: Option<BindGroup>,
+        /// The bind group for indexed meshes during the late mesh preprocessing
+        /// phase.
+        late_indexed: Option<BindGroup>,
+        /// The bind group for non-indexed meshes during the late mesh
+        /// preprocessing phase.
+        late_non_indexed: Option<BindGroup>,
+    },
 }

-/// The bind groups for the indirect parameters building compute shader.
-///
-/// This is shared among all views and phases.
+/// The bind groups for the compute shaders that reset indirect draw counts and
+/// build indirect parameters.
 #[derive(Resource)]
 pub struct BuildIndirectParametersBindGroups {
-    /// The bind group used for indexed meshes.
-    ///
-    /// This will be `None` if there are no indexed meshes.
-    indexed: Option<BindGroup>,
-    /// The bind group used for non-indexed meshes.
-    ///
-    /// This will be `None` if there are no non-indexed meshes.
-    non_indexed: Option<BindGroup>,
+    /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
+    /// indexed meshes.
+    reset_indexed_indirect_batch_sets: Option<BindGroup>,
+    /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
+    /// non-indexed meshes.
+    reset_non_indexed_indirect_batch_sets: Option<BindGroup>,
+    /// The bind group for the `build_indirect_params.wgsl` shader, for indexed
+    /// meshes.
+    build_indexed_indirect: Option<BindGroup>,
+    /// The bind group for the `build_indirect_params.wgsl` shader, for
+    /// non-indexed meshes.
+    build_non_indexed_indirect: Option<BindGroup>,
 }

 /// Stops the `GpuPreprocessNode` attempting to generate the buffer for this view
@@ -241,8 +354,14 @@ impl Plugin for GpuMeshPreprocessPlugin {
         );
         load_internal_asset!(
             app,
-            MESH_PREPROCESS_TYPES_SHADER_HANDLE,
-            "mesh_preprocess_types.wgsl",
+            RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE,
+            "reset_indirect_batch_sets.wgsl",
+            Shader::from_wgsl
+        );
+        load_internal_asset!(
+            app,
+            BUILD_INDIRECT_PARAMS_SHADER_HANDLE,
+            "build_indirect_params.wgsl",
             Shader::from_wgsl
         );
         load_internal_asset!(
@@ -268,6 +387,7 @@ impl Plugin for GpuMeshPreprocessPlugin {
         render_app
             .init_resource::<PreprocessPipelines>()
             .init_resource::<SpecializedComputePipelines<PreprocessPipeline>>()
+            .init_resource::<SpecializedComputePipelines<ResetIndirectBatchSetsPipeline>>()
             .init_resource::<SpecializedComputePipelines<BuildIndirectParametersPipeline>>()
             .add_systems(
                 Render,
@@ -279,25 +399,43 @@ impl Plugin for GpuMeshPreprocessPlugin {
                     )
                     .in_set(RenderSet::PrepareBindGroups),
                     write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
-                )
+                ),
             )
-            .add_render_graph_node::<GpuPreprocessNode>(Core3d, NodePbr::GpuPreprocess)
-            .add_render_graph_node::<BuildIndirectParametersNode>(
+            .add_render_graph_node::<EarlyGpuPreprocessNode>(Core3d, NodePbr::EarlyGpuPreprocess)
+            .add_render_graph_node::<LateGpuPreprocessNode>(Core3d, NodePbr::LateGpuPreprocess)
+            .add_render_graph_node::<EarlyPrepassBuildIndirectParametersNode>(
                 Core3d,
-                NodePbr::BuildIndirectParameters
+                NodePbr::EarlyPrepassBuildIndirectParameters,
             )
-            .add_render_graph_edges(
+            .add_render_graph_node::<LatePrepassBuildIndirectParametersNode>(
+                Core3d,
+                NodePbr::LatePrepassBuildIndirectParameters,
+            )
+            .add_render_graph_node::<MainBuildIndirectParametersNode>(
                 Core3d,
-                (NodePbr::GpuPreprocess, NodePbr::BuildIndirectParameters, Node3d::Prepass)
+                NodePbr::MainBuildIndirectParameters,
             )
             .add_render_graph_edges(
                 Core3d,
-                (NodePbr::GpuPreprocess, NodePbr::BuildIndirectParameters, NodePbr::ShadowPass)
+                (
+                    NodePbr::EarlyGpuPreprocess,
+                    NodePbr::EarlyPrepassBuildIndirectParameters,
+                    Node3d::EarlyPrepass,
+                    Node3d::EarlyDownsampleDepth,
+                    NodePbr::LateGpuPreprocess,
+                    NodePbr::LatePrepassBuildIndirectParameters,
+                    Node3d::LatePrepass,
+                    NodePbr::MainBuildIndirectParameters,
+                    // Shadows don't currently support occlusion culling, so we
+                    // treat shadows as effectively the main phase for our
+                    // purposes.
+ NodePbr::ShadowPass, + ), ); } } -impl FromWorld for GpuPreprocessNode { +impl FromWorld for EarlyGpuPreprocessNode { fn from_world(world: &mut World) -> Self { Self { view_query: QueryState::new(world), @@ -306,7 +450,7 @@ impl FromWorld for GpuPreprocessNode { } } -impl Node for GpuPreprocessNode { +impl Node for EarlyGpuPreprocessNode { fn update(&mut self, world: &mut World) { self.view_query.update_archetypes(world); self.main_view_query.update_archetypes(world); @@ -331,7 +475,7 @@ impl Node for GpuPreprocessNode { render_context .command_encoder() .begin_compute_pass(&ComputePassDescriptor { - label: Some("mesh preprocessing"), + label: Some("early mesh preprocessing"), timestamp_writes: None, }); @@ -346,24 +490,42 @@ impl Node for GpuPreprocessNode { // Run the compute passes. for view_entity in all_views { - let Ok((view, bind_groups, view_uniform_offset, no_indirect_drawing)) = - self.view_query.get_manual(world, view_entity) + let Ok(( + view, + bind_groups, + view_uniform_offset, + no_indirect_drawing, + occlusion_culling, + )) = self.view_query.get_manual(world, view_entity) else { continue; }; + let Some(bind_groups) = bind_groups else { + continue; + }; + let Some(view_uniform_offset) = view_uniform_offset else { + continue; + }; + // Grab the work item buffers for this view. - let Some(view_work_item_buffers) = index_buffers.get(&view) else { + let Some(phase_work_item_buffers) = index_buffers.get(&view) else { warn!("The preprocessing index buffer wasn't present"); continue; }; // Select the right pipeline, depending on whether GPU culling is in // use. - let maybe_pipeline_id = if !no_indirect_drawing { - preprocess_pipelines.gpu_culling_preprocess.pipeline_id - } else { + let maybe_pipeline_id = if no_indirect_drawing { preprocess_pipelines.direct_preprocess.pipeline_id + } else if occlusion_culling { + preprocess_pipelines + .early_gpu_occlusion_culling_preprocess + .pipeline_id + } else { + preprocess_pipelines + .gpu_frustum_culling_preprocess + .pipeline_id }; // Fetch the pipeline. @@ -382,7 +544,7 @@ impl Node for GpuPreprocessNode { compute_pass.set_pipeline(preprocess_pipeline); // Loop over each render phase. - for (phase_type_id, phase_work_item_buffers) in view_work_item_buffers { + for (phase_type_id, work_item_buffers) in phase_work_item_buffers { // Fetch the bind group for the render phase. let Some(phase_bind_groups) = bind_groups.get(phase_type_id) else { continue; @@ -400,21 +562,25 @@ impl Node for GpuPreprocessNode { PhasePreprocessBindGroups::Direct(ref bind_group) => { // Invoke the mesh preprocessing shader to transform // meshes only, but not cull. - let PreprocessWorkItemBuffers::Direct(phase_work_item_buffer) = - phase_work_item_buffers + let PreprocessWorkItemBuffers::Direct(work_item_buffer) = work_item_buffers else { continue; }; compute_pass.set_bind_group(0, bind_group, &dynamic_offsets); - let workgroup_count = phase_work_item_buffer.len().div_ceil(WORKGROUP_SIZE); + let workgroup_count = work_item_buffer.len().div_ceil(WORKGROUP_SIZE); if workgroup_count > 0 { compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); } } - PhasePreprocessBindGroups::Indirect { + PhasePreprocessBindGroups::IndirectFrustumCulling { indexed: ref maybe_indexed_bind_group, non_indexed: ref maybe_non_indexed_bind_group, + } + | PhasePreprocessBindGroups::IndirectOcclusionCulling { + early_indexed: ref maybe_indexed_bind_group, + early_non_indexed: ref maybe_non_indexed_bind_group, + .. 
} => { // Invoke the mesh preprocessing shader to transform and // cull the meshes. @@ -422,7 +588,7 @@ impl Node for GpuPreprocessNode { indexed: indexed_buffer, non_indexed: non_indexed_buffer, .. - } = phase_work_item_buffers + } = work_item_buffers else { continue; }; @@ -457,7 +623,31 @@ impl Node for GpuPreprocessNode { } } -impl FromWorld for BuildIndirectParametersNode { +impl FromWorld for EarlyPrepassBuildIndirectParametersNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl FromWorld for LatePrepassBuildIndirectParametersNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl FromWorld for MainBuildIndirectParametersNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl FromWorld for LateGpuPreprocessNode { fn from_world(world: &mut World) -> Self { Self { view_query: QueryState::new(world), @@ -465,7 +655,7 @@ impl FromWorld for BuildIndirectParametersNode { } } -impl Node for BuildIndirectParametersNode { +impl Node for LateGpuPreprocessNode { fn update(&mut self, world: &mut World) { self.view_query.update_archetypes(world); } @@ -476,88 +666,102 @@ impl Node for BuildIndirectParametersNode { render_context: &mut RenderContext<'w>, world: &'w World, ) -> Result<(), NodeRunError> { - // Fetch the bind group. - let Some(build_indirect_params_bind_groups) = - world.get_resource::() - else { - return Ok(()); - }; + // Grab the [`BatchedInstanceBuffers`]. + let BatchedInstanceBuffers { + ref work_item_buffers, + ref late_indexed_indirect_parameters_buffer, + ref late_non_indexed_indirect_parameters_buffer, + .. + } = world.resource::>(); - // Fetch the pipelines and the buffers we need. let pipeline_cache = world.resource::(); let preprocess_pipelines = world.resource::(); - let indirect_parameters_buffers = world.resource::(); - // Create the compute pass. let mut compute_pass = render_context .command_encoder() .begin_compute_pass(&ComputePassDescriptor { - label: Some("build indirect parameters"), + label: Some("late mesh preprocessing"), timestamp_writes: None, }); - // Fetch the pipelines. + // Run the compute passes. + for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) { + // Grab the work item buffers for this view. + let Some(phase_work_item_buffers) = work_item_buffers.get(&view) else { + warn!("The preprocessing index buffer wasn't present"); + continue; + }; - let (maybe_indexed_pipeline_id, maybe_non_indexed_pipeline_id) = ( - preprocess_pipelines - .build_indexed_indirect_params - .pipeline_id, - preprocess_pipelines - .build_non_indexed_indirect_params - .pipeline_id, - ); + let maybe_pipeline_id = preprocess_pipelines + .late_gpu_occlusion_culling_preprocess + .pipeline_id; - let ( - Some(build_indexed_indirect_params_pipeline_id), - Some(build_non_indexed_indirect_params_pipeline_id), - ) = (maybe_indexed_pipeline_id, maybe_non_indexed_pipeline_id) - else { - warn!("The build indirect parameters pipelines weren't ready"); - return Ok(()); - }; + // Fetch the pipeline. 
+ let Some(preprocess_pipeline_id) = maybe_pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't ready"); + return Ok(()); + }; - let ( - Some(build_indexed_indirect_params_pipeline), - Some(build_non_indexed_indirect_params_pipeline), - ) = ( - pipeline_cache.get_compute_pipeline(build_indexed_indirect_params_pipeline_id), - pipeline_cache.get_compute_pipeline(build_non_indexed_indirect_params_pipeline_id), - ) - else { - // This will happen while the pipeline is being compiled and is fine. - return Ok(()); - }; + let Some(preprocess_pipeline) = + pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; - // Transform the [`IndirectParametersMetadata`] that the GPU mesh - // preprocessing phase wrote to [`IndirectParametersIndexed`] for - // indexed meshes, if we have any. - if let Some(ref build_indirect_indexed_params_bind_group) = - build_indirect_params_bind_groups.indexed - { - compute_pass.set_pipeline(build_indexed_indirect_params_pipeline); - compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]); - let workgroup_count = indirect_parameters_buffers - .indexed_batch_count() - .div_ceil(WORKGROUP_SIZE); - if workgroup_count > 0 { - compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); - } - } + compute_pass.set_pipeline(preprocess_pipeline); - // Transform the [`IndirectParametersMetadata`] that the GPU mesh - // preprocessing phase wrote to [`IndirectParametersNonIndexed`] for - // non-indexed meshes, if we have any. - if let Some(ref build_indirect_non_indexed_params_bind_group) = - build_indirect_params_bind_groups.non_indexed - { - compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline); - compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]); - let workgroup_count = indirect_parameters_buffers - .non_indexed_batch_count() - .div_ceil(WORKGROUP_SIZE); - if workgroup_count > 0 { - compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + for (phase_type_id, work_item_buffers) in phase_work_item_buffers { + let ( + PreprocessWorkItemBuffers::Indirect { + gpu_occlusion_culling: + Some(GpuOcclusionCullingWorkItemBuffers { + late_indirect_parameters_indexed_offset, + late_indirect_parameters_non_indexed_offset, + .. + }), + .. + }, + Some(PhasePreprocessBindGroups::IndirectOcclusionCulling { + late_indexed: ref late_indexed_bind_group, + late_non_indexed: ref late_non_indexed_bind_group, + .. 
+ }), + Some(late_indexed_indirect_parameters_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) = ( + work_item_buffers, + bind_groups.get(phase_type_id), + late_indexed_indirect_parameters_buffer.buffer(), + late_non_indexed_indirect_parameters_buffer.buffer(), + ) + else { + continue; + }; + + let mut dynamic_offsets: SmallVec<[u32; 1]> = smallvec![]; + dynamic_offsets.push(view_uniform_offset.offset); + + compute_pass.set_bind_group( + 0, + late_indexed_bind_group.as_deref(), + &dynamic_offsets, + ); + compute_pass.dispatch_workgroups_indirect( + late_indexed_indirect_parameters_buffer, + *late_indirect_parameters_indexed_offset as u64, + ); + + compute_pass.set_bind_group( + 0, + late_non_indexed_bind_group.as_deref(), + &dynamic_offsets, + ); + compute_pass.dispatch_workgroups_indirect( + late_non_indexed_indirect_parameters_buffer, + *late_indirect_parameters_non_indexed_offset as u64, + ); } } @@ -565,15 +769,221 @@ impl Node for BuildIndirectParametersNode { } } +impl Node for EarlyPrepassBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.early_phase, + "early prepass indirect parameters building", + ) + } +} + +impl Node for LatePrepassBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.late_phase, + "late prepass indirect parameters building", + ) + } +} + +impl Node for MainBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.main_phase, + "main indirect parameters building", + ) + } +} + +fn run_build_indirect_parameters_node( + render_context: &mut RenderContext, + world: &World, + preprocess_phase_pipelines: &PreprocessPhasePipelines, + label: &'static str, +) -> Result<(), NodeRunError> { + let Some(build_indirect_params_bind_groups) = + world.get_resource::() + else { + return Ok(()); + }; + + let pipeline_cache = world.resource::(); + let indirect_parameters_buffers = world.resource::(); + + let mut compute_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some(label), + timestamp_writes: None, + }); + + // Fetch the pipeline. 
+ let ( + Some(reset_indirect_batch_sets_pipeline_id), + Some(build_indexed_indirect_params_pipeline_id), + Some(build_non_indexed_indirect_params_pipeline_id), + ) = ( + preprocess_phase_pipelines + .reset_indirect_batch_sets + .pipeline_id, + preprocess_phase_pipelines + .gpu_occlusion_culling_build_indexed_indirect_params + .pipeline_id, + preprocess_phase_pipelines + .gpu_occlusion_culling_build_non_indexed_indirect_params + .pipeline_id, + ) + else { + warn!("The build indirect parameters pipelines weren't ready"); + return Ok(()); + }; + + let ( + Some(reset_indirect_batch_sets_pipeline), + Some(build_indexed_indirect_params_pipeline), + Some(build_non_indexed_indirect_params_pipeline), + ) = ( + pipeline_cache.get_compute_pipeline(reset_indirect_batch_sets_pipeline_id), + pipeline_cache.get_compute_pipeline(build_indexed_indirect_params_pipeline_id), + pipeline_cache.get_compute_pipeline(build_non_indexed_indirect_params_pipeline_id), + ) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; + + // Build indexed indirect parameters. + if let ( + Some(reset_indexed_indirect_batch_sets_bind_group), + Some(build_indirect_indexed_params_bind_group), + ) = ( + &build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets, + &build_indirect_params_bind_groups.build_indexed_indirect, + ) { + compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline); + compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .batch_set_count(true) + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + compute_pass.set_pipeline(build_indexed_indirect_params_pipeline); + compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .indexed_batch_count() + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + } + + // Build non-indexed indirect parameters. + if let ( + Some(reset_non_indexed_indirect_batch_sets_bind_group), + Some(build_indirect_non_indexed_params_bind_group), + ) = ( + &build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets, + &build_indirect_params_bind_groups.build_non_indexed_indirect, + ) { + compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline); + compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .batch_set_count(false) + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline); + compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .non_indexed_batch_count() + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + } + + Ok(()) +} + impl PreprocessPipelines { /// Returns true if the preprocessing and indirect parameters pipelines have /// been loaded or false otherwise. 
pub(crate) fn pipelines_are_loaded(&self, pipeline_cache: &PipelineCache) -> bool { self.direct_preprocess.is_loaded(pipeline_cache) - && self.gpu_culling_preprocess.is_loaded(pipeline_cache) - && self.build_indexed_indirect_params.is_loaded(pipeline_cache) && self - .build_non_indexed_indirect_params + .gpu_frustum_culling_preprocess + .is_loaded(pipeline_cache) + && self + .early_gpu_occlusion_culling_preprocess + .is_loaded(pipeline_cache) + && self + .late_gpu_occlusion_culling_preprocess + .is_loaded(pipeline_cache) + && self + .gpu_frustum_culling_build_indexed_indirect_params + .is_loaded(pipeline_cache) + && self + .gpu_frustum_culling_build_non_indexed_indirect_params + .is_loaded(pipeline_cache) + && self.early_phase.is_loaded(pipeline_cache) + && self.late_phase.is_loaded(pipeline_cache) + && self.main_phase.is_loaded(pipeline_cache) + } +} + +impl PreprocessPhasePipelines { + fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.reset_indirect_batch_sets.is_loaded(pipeline_cache) + && self + .gpu_occlusion_culling_build_indexed_indirect_params + .is_loaded(pipeline_cache) + && self + .gpu_occlusion_culling_build_non_indexed_indirect_params .is_loaded(pipeline_cache) } } @@ -585,6 +995,13 @@ impl PreprocessPipeline { } } +impl ResetIndirectBatchSetsPipeline { + fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.pipeline_id + .is_some_and(|pipeline_id| pipeline_cache.get_compute_pipeline(pipeline_id).is_some()) + } +} + impl BuildIndirectParametersPipeline { /// Returns true if this pipeline has been loaded into the pipeline cache or /// false otherwise. @@ -599,17 +1016,32 @@ impl SpecializedComputePipeline for PreprocessPipeline { fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor { let mut shader_defs = vec![]; - if key.contains(PreprocessPipelineKey::GPU_CULLING) { + if key.contains(PreprocessPipelineKey::FRUSTUM_CULLING) { shader_defs.push("INDIRECT".into()); shader_defs.push("FRUSTUM_CULLING".into()); } + if key.contains(PreprocessPipelineKey::OCCLUSION_CULLING) { + shader_defs.push("OCCLUSION_CULLING".into()); + if key.contains(PreprocessPipelineKey::EARLY_PHASE) { + shader_defs.push("EARLY_PHASE".into()); + } else { + shader_defs.push("LATE_PHASE".into()); + } + } ComputePipelineDescriptor { label: Some( format!( "mesh preprocessing ({})", - if key.contains(PreprocessPipelineKey::GPU_CULLING) { - "GPU culling" + if key.contains( + PreprocessPipelineKey::OCCLUSION_CULLING + | PreprocessPipelineKey::EARLY_PHASE + ) { + "early GPU occlusion culling" + } else if key.contains(PreprocessPipelineKey::OCCLUSION_CULLING) { + "late GPU occlusion culling" + } else if key.contains(PreprocessPipelineKey::FRUSTUM_CULLING) { + "GPU frustum culling" } else { "direct" } @@ -633,15 +1065,20 @@ impl FromWorld for PreprocessPipelines { // GPU culling bind group parameters are a superset of those in the CPU // culling (direct) shader. 
let direct_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries(); - let gpu_culling_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries() - .extend_sequential(( - // `indirect_parameters_metadata` - storage_buffer::(/* has_dynamic_offset= */ false), - // `mesh_culling_data` - storage_buffer_read_only::(/* has_dynamic_offset= */ false), - // `view` - uniform_buffer::(/* has_dynamic_offset= */ true), - )); + let gpu_frustum_culling_bind_group_layout_entries = gpu_culling_bind_group_layout_entries(); + let gpu_early_occlusion_culling_bind_group_layout_entries = + gpu_occlusion_culling_bind_group_layout_entries().extend_with_indices((( + 11, + storage_buffer::(/*has_dynamic_offset=*/ false), + ),)); + let gpu_late_occlusion_culling_bind_group_layout_entries = + gpu_occlusion_culling_bind_group_layout_entries(); + + let reset_indirect_batch_sets_bind_group_layout_entries = + DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + (storage_buffer::(false),), + ); // Indexed and non-indexed bind group parameters share all the bind // group layout entries except the final one. @@ -657,9 +1094,21 @@ impl FromWorld for PreprocessPipelines { "build mesh uniforms direct bind group layout", &direct_bind_group_layout_entries, ); - let gpu_culling_bind_group_layout = render_device.create_bind_group_layout( - "build mesh uniforms GPU culling bind group layout", - &gpu_culling_bind_group_layout_entries, + let gpu_frustum_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU frustum culling bind group layout", + &gpu_frustum_culling_bind_group_layout_entries, + ); + let gpu_early_occlusion_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU early occlusion culling bind group layout", + &gpu_early_occlusion_culling_bind_group_layout_entries, + ); + let gpu_late_occlusion_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU late occlusion culling bind group layout", + &gpu_late_occlusion_culling_bind_group_layout_entries, + ); + let reset_indirect_batch_sets_bind_group_layout = render_device.create_bind_group_layout( + "reset indirect batch sets bind group layout", + &reset_indirect_batch_sets_bind_group_layout_entries, ); let build_indexed_indirect_params_bind_group_layout = render_device .create_bind_group_layout( @@ -672,39 +1121,67 @@ impl FromWorld for PreprocessPipelines { &build_non_indexed_indirect_params_bind_group_layout_entries, ); + let preprocess_phase_pipelines = PreprocessPhasePipelines { + reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline { + bind_group_layout: reset_indirect_batch_sets_bind_group_layout.clone(), + pipeline_id: None, + }, + gpu_occlusion_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline { + bind_group_layout: build_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + gpu_occlusion_culling_build_non_indexed_indirect_params: + BuildIndirectParametersPipeline { + bind_group_layout: build_non_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + }; + PreprocessPipelines { direct_preprocess: PreprocessPipeline { bind_group_layout: direct_bind_group_layout, pipeline_id: None, }, - gpu_culling_preprocess: PreprocessPipeline { - bind_group_layout: gpu_culling_bind_group_layout, + gpu_frustum_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_frustum_culling_bind_group_layout, + pipeline_id: None, + }, + 
early_gpu_occlusion_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_early_occlusion_culling_bind_group_layout, pipeline_id: None, }, - build_indexed_indirect_params: BuildIndirectParametersPipeline { - bind_group_layout: build_indexed_indirect_params_bind_group_layout, + late_gpu_occlusion_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_late_occlusion_culling_bind_group_layout, pipeline_id: None, }, - build_non_indexed_indirect_params: BuildIndirectParametersPipeline { - bind_group_layout: build_non_indexed_indirect_params_bind_group_layout, + gpu_frustum_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline { + bind_group_layout: build_indexed_indirect_params_bind_group_layout.clone(), pipeline_id: None, }, + gpu_frustum_culling_build_non_indexed_indirect_params: + BuildIndirectParametersPipeline { + bind_group_layout: build_non_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + early_phase: preprocess_phase_pipelines.clone(), + late_phase: preprocess_phase_pipelines.clone(), + main_phase: preprocess_phase_pipelines.clone(), } } } fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { - DynamicBindGroupLayoutEntries::sequential( + DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( // `current_input` - storage_buffer_read_only::(false), + (3, storage_buffer_read_only::(false)), // `previous_input` - storage_buffer_read_only::(false), + (4, storage_buffer_read_only::(false)), // `indices` - storage_buffer_read_only::(false), + (5, storage_buffer_read_only::(false)), // `output` - storage_buffer::(false), + (6, storage_buffer::(false)), ), ) } @@ -712,37 +1189,103 @@ fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntrie // Returns the first 3 bind group layout entries shared between all invocations // of the indirect parameters building shader. fn build_indirect_params_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { - DynamicBindGroupLayoutEntries::sequential( + DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( - storage_buffer_read_only::(false), - storage_buffer_read_only::(false), - storage_buffer::(false), + (0, storage_buffer_read_only::(false)), + ( + 1, + storage_buffer_read_only::(false), + ), + (2, storage_buffer::(false)), ), ) } /// A system that specializes the `mesh_preprocess.wgsl` and /// `build_indirect_params.wgsl` pipelines if necessary. +fn gpu_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { + // GPU culling bind group parameters are a superset of those in the CPU + // culling (direct) shader. + preprocess_direct_bind_group_layout_entries().extend_with_indices(( + // `indirect_parameters` + ( + 7, + storage_buffer::(/* has_dynamic_offset= */ false), + ), + // `mesh_culling_data` + ( + 8, + storage_buffer_read_only::(/* has_dynamic_offset= */ false), + ), + // `view` + ( + 0, + uniform_buffer::(/* has_dynamic_offset= */ true), + ), + )) +} + +fn gpu_occlusion_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { + gpu_culling_bind_group_layout_entries().extend_with_indices(( + ( + 2, + uniform_buffer::(/*has_dynamic_offset=*/ false), + ), + ( + 10, + texture_2d(TextureSampleType::Float { filterable: true }), + ), + ( + 12, + storage_buffer::( + /*has_dynamic_offset=*/ false, + ), + ), + )) +} + +/// A system that specializes the `mesh_preprocess.wgsl` pipelines if necessary. 
pub fn prepare_preprocess_pipelines( pipeline_cache: Res, render_device: Res, mut specialized_preprocess_pipelines: ResMut>, + mut specialized_reset_indirect_batch_sets_pipelines: ResMut< + SpecializedComputePipelines, + >, mut specialized_build_indirect_parameters_pipelines: ResMut< SpecializedComputePipelines, >, - mut preprocess_pipelines: ResMut, + preprocess_pipelines: ResMut, ) { + let preprocess_pipelines = preprocess_pipelines.into_inner(); + preprocess_pipelines.direct_preprocess.prepare( &pipeline_cache, &mut specialized_preprocess_pipelines, PreprocessPipelineKey::empty(), ); - preprocess_pipelines.gpu_culling_preprocess.prepare( + preprocess_pipelines.gpu_frustum_culling_preprocess.prepare( &pipeline_cache, &mut specialized_preprocess_pipelines, - PreprocessPipelineKey::GPU_CULLING, + PreprocessPipelineKey::FRUSTUM_CULLING, ); + preprocess_pipelines + .early_gpu_occlusion_culling_preprocess + .prepare( + &pipeline_cache, + &mut specialized_preprocess_pipelines, + PreprocessPipelineKey::FRUSTUM_CULLING + | PreprocessPipelineKey::OCCLUSION_CULLING + | PreprocessPipelineKey::EARLY_PHASE, + ); + preprocess_pipelines + .late_gpu_occlusion_culling_preprocess + .prepare( + &pipeline_cache, + &mut specialized_preprocess_pipelines, + PreprocessPipelineKey::FRUSTUM_CULLING | PreprocessPipelineKey::OCCLUSION_CULLING, + ); let mut build_indirect_parameters_pipeline_key = BuildIndirectParametersPipelineKey::empty(); @@ -757,18 +1300,61 @@ pub fn prepare_preprocess_pipelines( .insert(BuildIndirectParametersPipelineKey::MULTI_DRAW_INDIRECT_COUNT_SUPPORTED); } - preprocess_pipelines.build_indexed_indirect_params.prepare( - &pipeline_cache, - &mut specialized_build_indirect_parameters_pipelines, - build_indirect_parameters_pipeline_key | BuildIndirectParametersPipelineKey::INDEXED, - ); preprocess_pipelines - .build_non_indexed_indirect_params + .gpu_frustum_culling_build_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key | BuildIndirectParametersPipelineKey::INDEXED, + ); + preprocess_pipelines + .gpu_frustum_culling_build_non_indexed_indirect_params .prepare( &pipeline_cache, &mut specialized_build_indirect_parameters_pipelines, build_indirect_parameters_pipeline_key, ); + + for (preprocess_phase_pipelines, build_indirect_parameters_phase_pipeline_key) in [ + ( + &mut preprocess_pipelines.early_phase, + BuildIndirectParametersPipelineKey::EARLY_PHASE, + ), + ( + &mut preprocess_pipelines.late_phase, + BuildIndirectParametersPipelineKey::LATE_PHASE, + ), + ( + &mut preprocess_pipelines.main_phase, + BuildIndirectParametersPipelineKey::MAIN_PHASE, + ), + ] { + preprocess_phase_pipelines + .reset_indirect_batch_sets + .prepare( + &pipeline_cache, + &mut specialized_reset_indirect_batch_sets_pipelines, + ); + preprocess_phase_pipelines + .gpu_occlusion_culling_build_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key + | build_indirect_parameters_phase_pipeline_key + | BuildIndirectParametersPipelineKey::INDEXED + | BuildIndirectParametersPipelineKey::OCCLUSION_CULLING, + ); + preprocess_phase_pipelines + .gpu_occlusion_culling_build_non_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key + | build_indirect_parameters_phase_pipeline_key + | BuildIndirectParametersPipelineKey::OCCLUSION_CULLING, + ); + 
} } impl PreprocessPipeline { @@ -787,6 +1373,22 @@ impl PreprocessPipeline { } } +impl SpecializedComputePipeline for ResetIndirectBatchSetsPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("reset indirect batch sets".into()), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![], + shader: RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE, + shader_defs: vec![], + entry_point: "main".into(), + zero_initialize_workgroup_memory: false, + } + } +} + impl SpecializedComputePipeline for BuildIndirectParametersPipeline { type Key = BuildIndirectParametersPipelineKey; @@ -798,13 +1400,39 @@ impl SpecializedComputePipeline for BuildIndirectParametersPipeline { if key.contains(BuildIndirectParametersPipelineKey::MULTI_DRAW_INDIRECT_COUNT_SUPPORTED) { shader_defs.push("MULTI_DRAW_INDIRECT_COUNT_SUPPORTED".into()); } + if key.contains(BuildIndirectParametersPipelineKey::OCCLUSION_CULLING) { + shader_defs.push("OCCLUSION_CULLING".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::EARLY_PHASE) { + shader_defs.push("EARLY_PHASE".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::LATE_PHASE) { + shader_defs.push("LATE_PHASE".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::MAIN_PHASE) { + shader_defs.push("MAIN_PHASE".into()); + } - ComputePipelineDescriptor { - label: if key.contains(BuildIndirectParametersPipelineKey::INDEXED) { - Some("build indexed indirect parameters".into()) + let label = format!( + "{} build {}indexed indirect parameters", + if !key.contains(BuildIndirectParametersPipelineKey::OCCLUSION_CULLING) { + "frustum culling" + } else if key.contains(BuildIndirectParametersPipelineKey::EARLY_PHASE) { + "early occlusion culling" + } else if key.contains(BuildIndirectParametersPipelineKey::LATE_PHASE) { + "late occlusion culling" } else { - Some("build non-indexed indirect parameters".into()) + "main occlusion culling" }, + if key.contains(BuildIndirectParametersPipelineKey::INDEXED) { + "" + } else { + "non-" + } + ); + + ComputePipelineDescriptor { + label: Some(label.into()), layout: vec![self.bind_group_layout.clone()], push_constant_ranges: vec![], shader: BUILD_INDIRECT_PARAMS_SHADER_HANDLE, @@ -815,6 +1443,21 @@ impl SpecializedComputePipeline for BuildIndirectParametersPipeline { } } +impl ResetIndirectBatchSetsPipeline { + fn prepare( + &mut self, + pipeline_cache: &PipelineCache, + pipelines: &mut SpecializedComputePipelines, + ) { + if self.pipeline_id.is_some() { + return; + } + + let reset_indirect_batch_sets_pipeline_id = pipelines.specialize(pipeline_cache, self, ()); + self.pipeline_id = Some(reset_indirect_batch_sets_pipeline_id); + } +} + impl BuildIndirectParametersPipeline { fn prepare( &mut self, @@ -833,55 +1476,117 @@ impl BuildIndirectParametersPipeline { /// A system that attaches the mesh uniform buffers to the bind groups for the /// variants of the mesh preprocessing compute shader. +#[expect( + clippy::too_many_arguments, + reason = "it's a system that needs a lot of arguments" +)] pub fn prepare_preprocess_bind_groups( mut commands: Commands, + view_depth_pyramids: Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>, render_device: Res, batched_instance_buffers: Res>, indirect_parameters_buffers: Res, mesh_culling_data_buffer: Res, view_uniforms: Res, + previous_view_uniforms: Res, pipelines: Res, ) { // Grab the `BatchedInstanceBuffers`. 
- let batched_instance_buffers = batched_instance_buffers.into_inner(); - - let Some(current_input_buffer) = batched_instance_buffers - .current_input_buffer - .buffer() - .buffer() - else { + let BatchedInstanceBuffers { + data_buffer: ref data_buffer_vec, + ref work_item_buffers, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + ref late_indexed_indirect_parameters_buffer, + ref late_non_indexed_indirect_parameters_buffer, + } = batched_instance_buffers.into_inner(); + + let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( + current_input_buffer_vec.buffer().buffer(), + previous_input_buffer_vec.buffer().buffer(), + data_buffer_vec.buffer(), + ) else { return; }; - // Keep track of whether any of the phases will be drawn indirectly. If - // they are, then we'll need bind groups for the indirect parameters - // building shader too. + // Record whether we have any meshes that are to be drawn indirectly. If we + // don't, then we can skip building indirect parameters. let mut any_indirect = false; - for (view, phase_work_item_buffers) in &batched_instance_buffers.work_item_buffers { + // Loop over each view. + for (view, phase_work_item_buffers) in work_item_buffers { let mut bind_groups = TypeIdMap::default(); + // Loop over each phase. for (&phase_id, work_item_buffers) in phase_work_item_buffers { - if let Some(bind_group) = prepare_preprocess_bind_group_for_phase( - &render_device, - &pipelines, - &view_uniforms, - &indirect_parameters_buffers, - &mesh_culling_data_buffer, - batched_instance_buffers, - work_item_buffers, - &mut any_indirect, - ) { + // Create the `PreprocessBindGroupBuilder`. + let preprocess_bind_group_builder = PreprocessBindGroupBuilder { + view: *view, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + render_device: &render_device, + indirect_parameters_buffers: &indirect_parameters_buffers, + mesh_culling_data_buffer: &mesh_culling_data_buffer, + view_uniforms: &view_uniforms, + previous_view_uniforms: &previous_view_uniforms, + pipelines: &pipelines, + current_input_buffer, + previous_input_buffer, + data_buffer, + }; + + // Depending on the type of work items we have, construct the + // appropriate bind groups. + let (was_indirect, bind_group) = match *work_item_buffers { + PreprocessWorkItemBuffers::Direct(ref work_item_buffer) => ( + false, + preprocess_bind_group_builder + .create_direct_preprocess_bind_groups(work_item_buffer), + ), + + PreprocessWorkItemBuffers::Indirect { + indexed: ref indexed_work_item_buffer, + non_indexed: ref non_indexed_work_item_buffer, + gpu_occlusion_culling: Some(ref gpu_occlusion_culling_work_item_buffers), + } => ( + true, + preprocess_bind_group_builder + .create_indirect_occlusion_culling_preprocess_bind_groups( + &view_depth_pyramids, + indexed_work_item_buffer, + non_indexed_work_item_buffer, + gpu_occlusion_culling_work_item_buffers, + ), + ), + + PreprocessWorkItemBuffers::Indirect { + indexed: ref indexed_work_item_buffer, + non_indexed: ref non_indexed_work_item_buffer, + gpu_occlusion_culling: None, + } => ( + true, + preprocess_bind_group_builder + .create_indirect_frustum_culling_preprocess_bind_groups( + indexed_work_item_buffer, + non_indexed_work_item_buffer, + ), + ), + }; + + // Write that bind group in. + if let Some(bind_group) = bind_group { + any_indirect = any_indirect || was_indirect; bind_groups.insert(phase_id, bind_group); } } + // Save the bind groups. 
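+        // (There's one `PhasePreprocessBindGroups` entry per phase, keyed by
+        // the phase's `TypeId`; the whole map lives on the view entity.)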
commands .entity(*view) .insert(PreprocessBindGroups(bind_groups)); } - // If any of the phases will be drawn indirectly, create the bind groups for + // Now, if there were any indirect draw commands, create the bind groups for // the indirect parameters building shader. if any_indirect { create_build_indirect_parameters_bind_groups( @@ -894,164 +1599,609 @@ pub fn prepare_preprocess_bind_groups( } } -// Creates the bind group for the GPU preprocessing shader for a single phase -// for a single view. -#[expect( - clippy::too_many_arguments, - reason = "it's a system that needs a bunch of parameters" -)] -fn prepare_preprocess_bind_group_for_phase( - render_device: &RenderDevice, - pipelines: &PreprocessPipelines, - view_uniforms: &ViewUniforms, - indirect_parameters_buffers: &IndirectParametersBuffers, - mesh_culling_data_buffer: &MeshCullingDataBuffer, - batched_instance_buffers: &BatchedInstanceBuffers, - work_item_buffers: &PreprocessWorkItemBuffers, - any_indirect: &mut bool, -) -> Option { - // Get the current input buffers. +/// A temporary structure that stores all the information needed to construct +/// bind groups for the mesh preprocessing shader. +struct PreprocessBindGroupBuilder<'a> { + /// The render-world entity corresponding to the current view. + view: Entity, + /// The indirect compute dispatch parameters buffer for indexed meshes in + /// the late prepass. + late_indexed_indirect_parameters_buffer: + &'a RawBufferVec, + /// The indirect compute dispatch parameters buffer for non-indexed meshes + /// in the late prepass. + late_non_indexed_indirect_parameters_buffer: + &'a RawBufferVec, + /// The device. + render_device: &'a RenderDevice, + /// The buffers that store indirect draw parameters. + indirect_parameters_buffers: &'a IndirectParametersBuffers, + /// The GPU buffer that stores the information needed to cull each mesh. + mesh_culling_data_buffer: &'a MeshCullingDataBuffer, + /// The GPU buffer that stores information about the view. + view_uniforms: &'a ViewUniforms, + /// The GPU buffer that stores information about the view from last frame. + previous_view_uniforms: &'a PreviousViewUniforms, + /// The pipelines for the mesh preprocessing shader. + pipelines: &'a PreprocessPipelines, + /// The GPU buffer containing the list of [`MeshInputUniform`]s for the + /// current frame. + current_input_buffer: &'a Buffer, + /// The GPU buffer containing the list of [`MeshInputUniform`]s for the + /// previous frame. + previous_input_buffer: &'a Buffer, + /// The GPU buffer containing the list of [`MeshUniform`]s for the current + /// frame. + /// + /// This is the buffer containing the mesh's final transforms that the + /// shaders will write to. + data_buffer: &'a Buffer, +} - let BatchedInstanceBuffers { - data_buffer: ref data_buffer_vec, - current_input_buffer: ref current_input_buffer_vec, - previous_input_buffer: ref previous_input_buffer_vec, - .. - } = batched_instance_buffers; +impl<'a> PreprocessBindGroupBuilder<'a> { + /// Creates the bind groups for mesh preprocessing when GPU frustum culling + /// and GPU occlusion culling are both disabled. + fn create_direct_preprocess_bind_groups( + &self, + work_item_buffer: &BufferVec, + ) -> Option { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
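+        // Instead, bind exactly `len * PreprocessWorkItem::min_size()` bytes.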
+ let work_item_buffer_size = NonZero::::try_from( + work_item_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); - let current_input_buffer = current_input_buffer_vec.buffer().buffer()?; - let previous_input_buffer = previous_input_buffer_vec.buffer().buffer()?; - let data_buffer = data_buffer_vec.buffer()?; + Some(PhasePreprocessBindGroups::Direct( + self.render_device.create_bind_group( + "preprocess_direct_bind_group", + &self.pipelines.direct_preprocess.bind_group_layout, + &BindGroupEntries::sequential(( + self.current_input_buffer.as_entire_binding(), + self.previous_input_buffer.as_entire_binding(), + BindingResource::Buffer(BufferBinding { + buffer: work_item_buffer.buffer()?, + offset: 0, + size: work_item_buffer_size, + }), + self.data_buffer.as_entire_binding(), + )), + ), + )) + } - // Build the appropriate bind group, depending on whether we're drawing - // directly or indirectly. + /// Creates the bind groups for mesh preprocessing when GPU occlusion + /// culling is enabled. + fn create_indirect_occlusion_culling_preprocess_bind_groups( + &self, + view_depth_pyramids: &Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>, + indexed_work_item_buffer: &BufferVec, + non_indexed_work_item_buffer: &BufferVec, + gpu_occlusion_culling_work_item_buffers: &GpuOcclusionCullingWorkItemBuffers, + ) -> Option { + let GpuOcclusionCullingWorkItemBuffers { + late_indexed: ref late_indexed_work_item_buffer, + late_non_indexed: ref late_non_indexed_work_item_buffer, + late_indirect_parameters_indexed_offset, + late_indirect_parameters_non_indexed_offset, + } = *gpu_occlusion_culling_work_item_buffers; + + let (view_depth_pyramid, previous_view_uniform_offset) = + view_depth_pyramids.get(self.view).ok()?; + + Some(PhasePreprocessBindGroups::IndirectOcclusionCulling { + early_indexed: self.create_indirect_occlusion_culling_early_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + indexed_work_item_buffer, + late_indexed_work_item_buffer, + late_indirect_parameters_indexed_offset, + ), - match *work_item_buffers { - PreprocessWorkItemBuffers::Direct(ref work_item_buffer_vec) => { - let work_item_buffer = work_item_buffer_vec.buffer()?; + early_non_indexed: self.create_indirect_occlusion_culling_early_non_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + non_indexed_work_item_buffer, + late_non_indexed_work_item_buffer, + late_indirect_parameters_non_indexed_offset, + ), - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. 
- let work_item_buffer_size = NonZero::::try_from( - work_item_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); + late_indexed: self.create_indirect_occlusion_culling_late_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + late_indexed_work_item_buffer, + late_indirect_parameters_indexed_offset, + ), - Some(PhasePreprocessBindGroups::Direct( - render_device.create_bind_group( - "preprocess_direct_bind_group", - &pipelines.direct_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - BindingResource::Buffer(BufferBinding { - buffer: work_item_buffer, - offset: 0, - size: work_item_buffer_size, - }), - data_buffer.as_entire_binding(), - )), - ), - )) - } + late_non_indexed: self.create_indirect_occlusion_culling_late_non_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + late_non_indexed_work_item_buffer, + late_indirect_parameters_non_indexed_offset, + ), + }) + } - PreprocessWorkItemBuffers::Indirect { - indexed: ref indexed_buffer, - non_indexed: ref non_indexed_buffer, - } => { - // For indirect drawing, we need two separate bind groups, one for indexed meshes and one for non-indexed meshes. + /// Creates the bind group for the first phase of mesh preprocessing of + /// indexed meshes when GPU occlusion culling is enabled. + fn create_indirect_occlusion_culling_early_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + indexed_work_item_buffer: &BufferVec, + late_indexed_work_item_buffer: &UninitBufferVec, + late_indirect_parameters_indexed_offset: u32, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + indexed_work_item_buffer.buffer(), + late_indexed_work_item_buffer.buffer(), + self.late_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(indexed_metadata_buffer), + Some(indexed_work_item_gpu_buffer), + Some(late_indexed_work_item_gpu_buffer), + Some(late_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let indexed_work_item_buffer_size = NonZero::::try_from( + indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_early_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .early_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 11, + BufferBinding { + buffer: late_indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }, + ), + ( + 12, + BufferBinding { + buffer: late_indexed_indirect_parameters_buffer, + offset: late_indirect_parameters_indexed_offset as u64, + size: NonZeroU64::new(size_of::< + LatePreprocessWorkItemIndirectParameters, + >() + as u64), + }, + ), + )), + ), + ) + } + _ => None, + } + } - let mesh_culling_data_buffer = mesh_culling_data_buffer.buffer()?; - let view_uniforms_binding = view_uniforms.uniforms.binding()?; + /// Creates the bind group for the first phase of mesh preprocessing of + /// non-indexed meshes when GPU occlusion culling is enabled. + fn create_indirect_occlusion_culling_early_non_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + non_indexed_work_item_buffer: &BufferVec, + late_non_indexed_work_item_buffer: &UninitBufferVec, + late_indirect_parameters_non_indexed_offset: u32, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + non_indexed_work_item_buffer.buffer(), + late_non_indexed_work_item_buffer.buffer(), + self.late_non_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(non_indexed_metadata_buffer), + Some(non_indexed_work_item_gpu_buffer), + Some(late_non_indexed_work_item_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let non_indexed_work_item_buffer_size = NonZero::::try_from( + non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_early_non_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .early_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 11, + BufferBinding { + buffer: late_non_indexed_work_item_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }, + ), + ( + 12, + BufferBinding { + buffer: late_non_indexed_indirect_parameters_buffer, + offset: late_indirect_parameters_non_indexed_offset as u64, + size: NonZeroU64::new(size_of::< + LatePreprocessWorkItemIndirectParameters, + >() + as u64), + }, + ), + )), + ), + ) + } + _ => None, + } + } - let indexed_bind_group = match ( - indexed_buffer.buffer(), - indirect_parameters_buffers.indexed_metadata_buffer(), - ) { - ( - Some(indexed_work_item_buffer), - Some(indexed_indirect_parameters_metadata_buffer), - ) => { - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. - let indexed_work_item_buffer_size = NonZero::::try_from( - indexed_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); - Some(render_device.create_bind_group( - "preprocess_indexed_indirect_gpu_culling_bind_group", - &pipelines.gpu_culling_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - BindingResource::Buffer(BufferBinding { - buffer: indexed_work_item_buffer, - offset: 0, - size: indexed_work_item_buffer_size, - }), - data_buffer.as_entire_binding(), - indexed_indirect_parameters_metadata_buffer.as_entire_binding(), - mesh_culling_data_buffer.as_entire_binding(), - view_uniforms_binding.clone(), + /// Creates the bind group for the second phase of mesh preprocessing of + /// indexed meshes when GPU occlusion culling is enabled. 
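+    ///
+    /// The late phase's work item count is produced on the GPU by the early
+    /// phase, so the dispatch that consumes this bind group is itself
+    /// indirect, sized by a `LatePreprocessWorkItemIndirectParameters`
+    /// structure at the given offset.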
+ fn create_indirect_occlusion_culling_late_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + late_indexed_work_item_buffer: &UninitBufferVec, + late_indirect_parameters_indexed_offset: u32, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + late_indexed_work_item_buffer.buffer(), + self.late_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(indexed_metadata_buffer), + Some(late_indexed_work_item_gpu_buffer), + Some(late_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let late_indexed_work_item_buffer_size = NonZero::::try_from( + late_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_late_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .late_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: late_indexed_work_item_gpu_buffer, + offset: 0, + size: late_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 12, + BufferBinding { + buffer: late_indexed_indirect_parameters_buffer, + offset: late_indirect_parameters_indexed_offset as u64, + size: NonZeroU64::new(size_of::< + LatePreprocessWorkItemIndirectParameters, + >() + as u64), + }, + ), )), - )) - } - _ => None, - }; + ), + ) + } + _ => None, + } + } - let non_indexed_bind_group = match ( - non_indexed_buffer.buffer(), - indirect_parameters_buffers.non_indexed_metadata_buffer(), - ) { - ( - Some(non_indexed_work_item_buffer), - Some(non_indexed_indirect_parameters_metadata_buffer), - ) => { - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. 
- let non_indexed_work_item_buffer_size = NonZero::::try_from( - non_indexed_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); - Some(render_device.create_bind_group( - "preprocess_non_indexed_indirect_gpu_culling_bind_group", - &pipelines.gpu_culling_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - BindingResource::Buffer(BufferBinding { - buffer: non_indexed_work_item_buffer, - offset: 0, - size: non_indexed_work_item_buffer_size, - }), - data_buffer.as_entire_binding(), - non_indexed_indirect_parameters_metadata_buffer.as_entire_binding(), - mesh_culling_data_buffer.as_entire_binding(), - view_uniforms_binding, + /// Creates the bind group for the second phase of mesh preprocessing of + /// non-indexed meshes when GPU occlusion culling is enabled. + fn create_indirect_occlusion_culling_late_non_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + late_non_indexed_work_item_buffer: &UninitBufferVec, + late_indirect_parameters_non_indexed_offset: u32, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + late_non_indexed_work_item_buffer.buffer(), + self.late_non_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(non_indexed_metadata_buffer), + Some(non_indexed_work_item_gpu_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let non_indexed_work_item_buffer_size = NonZero::::try_from( + late_non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_late_non_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .late_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 12, + BufferBinding { + buffer: late_non_indexed_indirect_parameters_buffer, + offset: late_indirect_parameters_non_indexed_offset as u64, + size: NonZeroU64::new(size_of::< + LatePreprocessWorkItemIndirectParameters, + >() + as u64), + }, + ), )), - )) - } - _ => None, - }; + ), + ) + } + _ => None, + } + } - // Note that we found phases that will be drawn indirectly so that - // we remember to build the bind groups for the indirect parameter - // building shader. 
- *any_indirect = true; + /// Creates the bind groups for mesh preprocessing when GPU frustum culling + /// is enabled, but GPU occlusion culling is disabled. + fn create_indirect_frustum_culling_preprocess_bind_groups( + &self, + indexed_work_item_buffer: &BufferVec, + non_indexed_work_item_buffer: &BufferVec, + ) -> Option { + Some(PhasePreprocessBindGroups::IndirectFrustumCulling { + indexed: self + .create_indirect_frustum_culling_indexed_bind_group(indexed_work_item_buffer), + non_indexed: self.create_indirect_frustum_culling_non_indexed_bind_group( + non_indexed_work_item_buffer, + ), + }) + } - Some(PhasePreprocessBindGroups::Indirect { - indexed: indexed_bind_group, - non_indexed: non_indexed_bind_group, - }) + /// Creates the bind group for mesh preprocessing of indexed meshes when GPU + /// frustum culling is enabled, but GPU occlusion culling is disabled. + fn create_indirect_frustum_culling_indexed_bind_group( + &self, + indexed_work_item_buffer: &BufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + indexed_work_item_buffer.buffer(), + ) { + (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let indexed_work_item_buffer_size = NonZero::::try_from( + indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_gpu_indexed_frustum_culling_bind_group", + &self + .pipelines + .gpu_frustum_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + )), + ), + ) + } + _ => None, + } + } + + /// Creates the bind group for mesh preprocessing of non-indexed meshes when + /// GPU frustum culling is enabled, but GPU occlusion culling is disabled. + fn create_indirect_frustum_culling_non_indexed_bind_group( + &self, + non_indexed_work_item_buffer: &BufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + non_indexed_work_item_buffer.buffer(), + ) { + (Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer)) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
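+                // Both the indexed and non-indexed frustum-culling bind
+                // groups share the `gpu_frustum_culling_preprocess` layout;
+                // they differ only in the work item buffer at `@binding(5)`
+                // and the metadata buffer at `@binding(7)`.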
+ let non_indexed_work_item_buffer_size = NonZero::::try_from( + non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_gpu_non_indexed_frustum_culling_bind_group", + &self + .pipelines + .gpu_frustum_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + )), + ), + ) + } + _ => None, } } } /// A system that creates bind groups from the indirect parameters metadata and -/// data buffers for the indirect parameter building shader. +/// data buffers for the indirect batch set reset shader and the indirect +/// parameter building shader. fn create_build_indirect_parameters_bind_groups( commands: &mut Commands, render_device: &RenderDevice, @@ -1060,7 +2210,45 @@ fn create_build_indirect_parameters_bind_groups( indirect_parameters_buffer: &IndirectParametersBuffers, ) { commands.insert_resource(BuildIndirectParametersBindGroups { - indexed: match ( + reset_indexed_indirect_batch_sets: match ( + indirect_parameters_buffer.indexed_batch_sets_buffer(), + ) { + (Some(indexed_batch_sets_buffer),) => Some( + render_device.create_bind_group( + "reset_indexed_indirect_batch_sets_bind_group", + // The early bind group is good for the main phase and late + // phase too. They bind the same buffers. + &pipelines + .early_phase + .reset_indirect_batch_sets + .bind_group_layout, + &BindGroupEntries::sequential((indexed_batch_sets_buffer.as_entire_binding(),)), + ), + ), + _ => None, + }, + + reset_non_indexed_indirect_batch_sets: match ( + indirect_parameters_buffer.non_indexed_batch_sets_buffer(), + ) { + (Some(non_indexed_batch_sets_buffer),) => Some( + render_device.create_bind_group( + "reset_non_indexed_indirect_batch_sets_bind_group", + // The early bind group is good for the main phase and late + // phase too. They bind the same buffers. + &pipelines + .early_phase + .reset_indirect_batch_sets + .bind_group_layout, + &BindGroupEntries::sequential(( + non_indexed_batch_sets_buffer.as_entire_binding(), + )), + ), + ), + _ => None, + }, + + build_indexed_indirect: match ( indirect_parameters_buffer.indexed_metadata_buffer(), indirect_parameters_buffer.indexed_data_buffer(), indirect_parameters_buffer.indexed_batch_sets_buffer(), @@ -1069,28 +2257,35 @@ fn create_build_indirect_parameters_bind_groups( Some(indexed_indirect_parameters_metadata_buffer), Some(indexed_indirect_parameters_data_buffer), Some(indexed_batch_sets_buffer), - ) => Some(render_device.create_bind_group( - "build_indexed_indirect_parameters_bind_group", - &pipelines.build_indexed_indirect_params.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - // Don't use `as_entire_binding` here; the shader reads - // the length and `RawBufferVec` overallocates. 
- BufferBinding { - buffer: indexed_indirect_parameters_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - indirect_parameters_buffer.indexed_batch_count() as u64 - * size_of::() as u64, - ), - }, - indexed_batch_sets_buffer.as_entire_binding(), - indexed_indirect_parameters_data_buffer.as_entire_binding(), - )), - )), + ) => Some( + render_device.create_bind_group( + "build_indexed_indirect_parameters_bind_group", + // The frustum culling bind group is good for occlusion culling + // too. They bind the same buffers. + &pipelines + .gpu_frustum_culling_build_indexed_indirect_params + .bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + // Don't use `as_entire_binding` here; the shader reads + // the length and `RawBufferVec` overallocates. + BufferBinding { + buffer: indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + indirect_parameters_buffer.indexed_batch_count() as u64 + * size_of::() as u64, + ), + }, + indexed_batch_sets_buffer.as_entire_binding(), + indexed_indirect_parameters_data_buffer.as_entire_binding(), + )), + ), + ), _ => None, }, - non_indexed: match ( + + build_non_indexed_indirect: match ( indirect_parameters_buffer.non_indexed_metadata_buffer(), indirect_parameters_buffer.non_indexed_data_buffer(), indirect_parameters_buffer.non_indexed_batch_sets_buffer(), @@ -1102,8 +2297,10 @@ fn create_build_indirect_parameters_bind_groups( ) => Some( render_device.create_bind_group( "build_non_indexed_indirect_parameters_bind_group", + // The frustum culling bind group is good for occlusion culling + // too. They bind the same buffers. &pipelines - .build_non_indexed_indirect_params + .gpu_frustum_culling_build_non_indexed_indirect_params .bind_group_layout, &BindGroupEntries::sequential(( current_input_buffer.as_entire_binding(), diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index dd92ef7828eecc..2ea2e5a38a6972 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -86,6 +86,7 @@ pub const MESH_FUNCTIONS_HANDLE: Handle = Handle::weak_from_u128(6300874 pub const MESH_SHADER_HANDLE: Handle = Handle::weak_from_u128(3252377289100772450); pub const SKINNING_HANDLE: Handle = Handle::weak_from_u128(13215291596265391738); pub const MORPH_HANDLE: Handle = Handle::weak_from_u128(970982813587607345); +pub const OCCLUSION_CULLING_HANDLE: Handle = Handle::weak_from_u128(285365001154292827); /// How many textures are allowed in the view bind group layout (`@group(0)`) before /// broader compatibility with WebGL and WebGPU is at risk, due to the minimum guaranteed @@ -133,6 +134,12 @@ impl Plugin for MeshRenderPlugin { load_internal_asset!(app, MESH_SHADER_HANDLE, "mesh.wgsl", Shader::from_wgsl); load_internal_asset!(app, SKINNING_HANDLE, "skinning.wgsl", Shader::from_wgsl); load_internal_asset!(app, MORPH_HANDLE, "morph.wgsl", Shader::from_wgsl); + load_internal_asset!( + app, + OCCLUSION_CULLING_HANDLE, + "occlusion_culling.wgsl", + Shader::from_wgsl + ); if app.get_sub_app(RenderApp).is_none() { return; @@ -1245,9 +1252,10 @@ pub fn extract_meshes_for_gpu_building( mut removed_visibilities_query: Extract>, mut removed_global_transforms_query: Extract>, mut removed_meshes_query: Extract>, - cameras_query: Extract, Without)>>, + gpu_culling_query: Extract, Without)>>, ) { - let any_gpu_culling = !cameras_query.is_empty(); + let any_gpu_culling = !gpu_culling_query.is_empty(); + for render_mesh_instance_queue in 
render_mesh_instance_queues.iter_mut() { render_mesh_instance_queue.init(any_gpu_culling); } @@ -1749,7 +1757,8 @@ impl GetFullBatchData for MeshPipeline { Some(batch_set_index) => u32::from(batch_set_index), None => !0, }, - instance_count: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index df73454a3e8808..46adaf3714d92b 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -1,14 +1,28 @@ -// GPU mesh uniform building. +// GPU mesh transforming and culling. // // This is a compute shader that expands each `MeshInputUniform` out to a full -// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` -// and `MeshUniform` are in a 1:N relationship.) It runs in parallel for all -// meshes for all views. As part of this process, the shader gathers each -// mesh's transform on the previous frame and writes it into the `MeshUniform` -// so that TAA works. +// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` and +// `MeshUniform` are in a 1:N relationship.) It runs in parallel for all meshes +// for all views. As part of this process, the shader gathers each mesh's +// transform on the previous frame and writes it into the `MeshUniform` so that +// TAA works. It also performs frustum culling and occlusion culling, if +// requested. +// +// If occlusion culling is on, this shader runs twice: once to prepare the +// meshes that were visible last frame, and once to prepare the meshes that +// weren't visible last frame but became visible this frame. The two invocations +// are known as *early mesh preprocessing* and *late mesh preprocessing* +// respectively. +#import bevy_pbr::mesh_preprocess_types::{IndirectParametersMetadata, MeshInput} #import bevy_pbr::mesh_types::{Mesh, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT} -#import bevy_pbr::mesh_preprocess_types::{MeshInput, IndirectParametersMetadata} +#import bevy_pbr::mesh_view_bindings::view +#import bevy_pbr::occlusion_culling +#import bevy_pbr::prepass_bindings::previous_view_uniforms +#import bevy_pbr::view_transformations::{ + position_world_to_ndc, position_world_to_view, ndc_to_uv, view_z_to_depth_ndc, + position_world_to_prev_ndc, position_world_to_prev_view, prev_view_z_to_depth_ndc +} #import bevy_render::maths #import bevy_render::view::View @@ -36,20 +50,44 @@ struct PreprocessWorkItem { indirect_parameters_index: u32, } +// The parameters for the indirect compute dispatch for the late mesh +// preprocessing phase. +struct LatePreprocessWorkItemIndirectParameters { + // The number of workgroups we're going to dispatch. + // + // This value should always be equal to `ceil(work_item_count / 64)`. + dispatch_x: atomic, + // The number of workgroups in the Y direction; always 1. + dispatch_y: u32, + // The number of workgroups in the Z direction; always 1. + dispatch_z: u32, + // The precise number of work items. + work_item_count: atomic, + // Padding. + // + // This isn't the usual structure padding; it's needed because some hardware + // requires indirect compute dispatch parameters to be aligned on 32-byte + // boundaries. + pad_a: u32, + pad_b: u32, + pad_c: u32, + pad_d: u32, +} + // The current frame's `MeshInput`. -@group(0) @binding(0) var current_input: array; +@group(0) @binding(3) var current_input: array; // The `MeshInput` values from the previous frame. 
-@group(0) @binding(1) var previous_input: array; +@group(0) @binding(4) var previous_input: array; // Indices into the `MeshInput` buffer. // // There may be many indices that map to the same `MeshInput`. -@group(0) @binding(2) var work_items: array; +@group(0) @binding(5) var work_items: array; // The output array of `Mesh`es. -@group(0) @binding(3) var output: array; +@group(0) @binding(6) var output: array; #ifdef INDIRECT // The array of indirect parameters for drawcalls. -@group(0) @binding(4) var indirect_parameters_metadata: +@group(0) @binding(7) var indirect_parameters_metadata: array; #endif @@ -57,11 +95,27 @@ struct PreprocessWorkItem { // Data needed to cull the meshes. // // At the moment, this consists only of AABBs. -@group(0) @binding(5) var mesh_culling_data: array; +@group(0) @binding(8) var mesh_culling_data: array; +#endif // FRUSTUM_CULLING + +#ifdef OCCLUSION_CULLING +// A bitfield of visibility for meshes. This is indexed by the +// `PreprocessWorkItem::input_index`. A 0 indicates that the mesh isn't visible +// and a 1 indicates that the mesh is visible. +@group(0) @binding(9) var view_visibility: array>; + +@group(0) @binding(10) var depth_pyramid: texture_2d; -// The view data, including the view matrix. -@group(0) @binding(6) var view: View; +#ifdef EARLY_PHASE +@group(0) @binding(11) var late_preprocess_work_items: + array; +#endif // EARLY_PHASE +@group(0) @binding(12) var late_preprocess_work_item_indirect_parameters: + LatePreprocessWorkItemIndirectParameters; +#endif // OCCLUSION_CULLING + +#ifdef FRUSTUM_CULLING // Returns true if the view frustum intersects an oriented bounding box (OBB). // // `aabb_center.w` should be 1.0. @@ -102,20 +156,31 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Figure out our instance index. If this thread doesn't correspond to any // index, bail. let instance_index = global_invocation_id.x; + +#ifdef LATE_PHASE + if (instance_index >= + atomicLoad(&late_preprocess_work_item_indirect_parameters.work_item_count)) { + return; + } +#else // LATE_PHASE if (instance_index >= arrayLength(&work_items)) { return; } +#endif // Unpack the work item. let input_index = work_items[instance_index].input_index; let output_index = work_items[instance_index].output_index; let indirect_parameters_index = work_items[instance_index].indirect_parameters_index; + let view_visibility_word_index = input_index / 32u; + let view_visibility_bitmask = 1u << (input_index % 32u); + // Unpack the input matrix. let world_from_local_affine_transpose = current_input[input_index].world_from_local; let world_from_local = maths::affine3_to_square(world_from_local_affine_transpose); - // Cull if necessary. + // Frustum cull if necessary. #ifdef FRUSTUM_CULLING if ((current_input[input_index].flags & MESH_FLAGS_NO_FRUSTUM_CULLING_BIT) == 0u) { let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; @@ -129,6 +194,114 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { } #endif + // Look up the previous model matrix. 
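+    // We fetch the previous transform before culling because the early
+    // occlusion culling phase tests the mesh's *last-frame* position against
+    // the last frame's depth pyramid.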
+ let previous_input_index = current_input[input_index].previous_input_index; + var previous_world_from_local_affine_transpose: mat3x4; + if (previous_input_index == 0xffffffff) { + previous_world_from_local_affine_transpose = world_from_local_affine_transpose; + } else { + previous_world_from_local_affine_transpose = + previous_input[previous_input_index].world_from_local; + } + let previous_world_from_local = + maths::affine3_to_square(previous_world_from_local_affine_transpose); + + // Occlusion cull if necessary. This is done by calculating the screen-space + // axis-aligned bounding box (AABB) of the mesh and testing it against the + // appropriate level of the depth pyramid (a.k.a. hierarchical Z-buffer). If + // no part of the AABB is in front of the corresponding pixel quad in the + // hierarchical Z-buffer, then this mesh must be occluded, and we can skip + // rendering it. +#ifdef OCCLUSION_CULLING + let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; + let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz; + + // Initialize the AABB and the maximum depth. + var aabb = vec4(0.0); + var max_depth_view = 0.0; + + // Build up the AABB by taking each corner of this mesh's OBB, transforming + // it, and updating the AABB and depth accordingly. + for (var i = 0u; i < 8u; i += 1u) { + let local_pos = aabb_center + select( + vec3(-1.0), + vec3(1.0), + vec3((i & 1) != 0, (i & 2) != 0, (i & 4) != 0) + ) * aabb_half_extents; + +#ifdef EARLY_PHASE + // If we're in the early phase, we're testing against the last frame's + // depth buffer, so we need to use the previous frame's transform. + let prev_world_pos = (previous_world_from_local * vec4(local_pos, 1.0)).xyz; + let view_pos = position_world_to_prev_view(prev_world_pos); + let ndc_pos = position_world_to_prev_ndc(prev_world_pos); +#else // EARLY_PHASE + // Otherwise, if this is the late phase, we use the current frame's + // transform. + let world_pos = (world_from_local * vec4(local_pos, 1.0)).xyz; + let view_pos = position_world_to_view(world_pos); + let ndc_pos = position_world_to_ndc(world_pos); +#endif // EARLY_PHASE + + let uv_pos = ndc_to_uv(ndc_pos.xy); + + // Update the AABB and maximum view-space depth. + if (i == 0u) { + aabb = vec4(uv_pos, uv_pos); + max_depth_view = view_pos.z; + } else { + aabb = vec4(min(aabb.xy, uv_pos), max(aabb.zw, uv_pos)); + max_depth_view = max(max_depth_view, view_pos.z); + } + } + + // Clip to the near plane to avoid the NDC depth becoming negative. +#ifdef EARLY_PHASE + max_depth_view = min(-previous_view_uniforms.clip_from_view[3][2], max_depth_view); +#else // EARLY_PHASE + max_depth_view = min(-view.clip_from_view[3][2], max_depth_view); +#endif // EARLY_PHASE + + // Figure out the depth of the occluder, and compare it to our own depth. + + let aabb_pixel_size = occlusion_culling::get_aabb_size_in_pixels(aabb, depth_pyramid); + let occluder_depth_ndc = + occlusion_culling::get_occluder_depth(aabb, aabb_pixel_size, depth_pyramid); + +#ifdef EARLY_PHASE + let max_depth_ndc = prev_view_z_to_depth_ndc(max_depth_view); +#else // EARLY_PHASE + let max_depth_ndc = view_z_to_depth_ndc(max_depth_view); +#endif + + // Are we culled out? + if (max_depth_ndc < occluder_depth_ndc) { +#ifdef EARLY_PHASE + // If this is the early phase, we need to make a note of this mesh so + // that we examine it again in the late phase. After all, it's possible + // that a mesh that was invisible last frame became visible in this + // frame, and we need to handle that. 
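+        // As a worked example: if 130 meshes get requeued here, the threads
+        // that receive work item indices 0, 64, and 128 each bump
+        // `dispatch_x`, leaving it at 3 = ceil(130 / 64), which matches this
+        // shader's 64-thread workgroup size.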
+ let output_work_item_index = atomicAdd( + &late_preprocess_work_item_indirect_parameters.work_item_count, 1u); + if (output_work_item_index % 64u == 0u) { + // Our workgroup size is 64, and the indirect parameters for the + // late mesh preprocessing phase are counted in workgroups, so if + // we're the first thread in this workgroup, bump the workgroup + // count. + atomicAdd(&late_preprocess_work_item_indirect_parameters.dispatch_x, 1u); + } + + // Enqueue a work item for the late prepass phase. + late_preprocess_work_items[output_work_item_index].input_index = input_index; + late_preprocess_work_items[output_work_item_index].output_index = output_index; + late_preprocess_work_items[output_work_item_index].indirect_parameters_index = + indirect_parameters_index; +#endif // EARLY_PHASE + // This mesh is culled. Skip it. + return; + } +#endif // OCCLUSION_CULLING + // Calculate inverse transpose. let local_from_world_transpose = transpose(maths::inverse_affine3(transpose( world_from_local_affine_transpose))); @@ -139,32 +312,34 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { vec4(local_from_world_transpose[1].yz, local_from_world_transpose[2].xy)); let local_from_world_transpose_b = local_from_world_transpose[2].z; - // Look up the previous model matrix. - let previous_input_index = current_input[input_index].previous_input_index; - var previous_world_from_local: mat3x4; - if (previous_input_index == 0xffffffff) { - previous_world_from_local = world_from_local_affine_transpose; - } else { - previous_world_from_local = previous_input[previous_input_index].world_from_local; - } - // Figure out the output index. In indirect mode, this involves bumping the // instance index in the indirect parameters metadata, which // `build_indirect_params.wgsl` will use to generate the actual indirect // parameters. Otherwise, this index was directly supplied to us. #ifdef INDIRECT +#ifdef LATE_PHASE let batch_output_index = - atomicAdd(&indirect_parameters_metadata[indirect_parameters_index].instance_count, 1u); + atomicLoad(&indirect_parameters_metadata[indirect_parameters_index].early_instance_count) + + atomicAdd(&indirect_parameters_metadata[indirect_parameters_index].late_instance_count, 1u); +#else // LATE_PHASE + let batch_output_index = atomicAdd( + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count, + 1u + ); +#endif // LATE_PHASE + let mesh_output_index = indirect_parameters_metadata[indirect_parameters_index].base_output_index + batch_output_index; + #else // INDIRECT let mesh_output_index = output_index; #endif // INDIRECT // Write the output. output[mesh_output_index].world_from_local = world_from_local_affine_transpose; - output[mesh_output_index].previous_world_from_local = previous_world_from_local; + output[mesh_output_index].previous_world_from_local = + previous_world_from_local_affine_transpose; output[mesh_output_index].local_from_world_transpose_a = local_from_world_transpose_a; output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b; output[mesh_output_index].flags = current_input[input_index].flags; diff --git a/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl deleted file mode 100644 index 974a9d303aa6db..00000000000000 --- a/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl +++ /dev/null @@ -1,98 +0,0 @@ -// Types needed for GPU mesh uniform building. 
- -#define_import_path bevy_pbr::mesh_preprocess_types - -// Per-frame data that the CPU supplies to the GPU. -struct MeshInput { - // The model transform. - world_from_local: mat3x4, - // The lightmap UV rect, packed into 64 bits. - lightmap_uv_rect: vec2, - // A set of bitflags corresponding to `MeshFlags` on the Rust side. See the - // `MESH_FLAGS_` flags in `mesh_types.wgsl` for a list of these. - flags: u32, - // The index of this mesh's `MeshInput` in the `previous_input` array, if - // applicable. If not present, this is `u32::MAX`. - previous_input_index: u32, - // The index of the first vertex in the vertex slab. - first_vertex_index: u32, - // The index of the first vertex index in the index slab. - // - // If this mesh isn't indexed, this value is ignored. - first_index_index: u32, - // For indexed meshes, the number of indices that this mesh has; for - // non-indexed meshes, the number of vertices that this mesh consists of. - index_count: u32, - current_skin_index: u32, - previous_skin_index: u32, - // Low 16 bits: index of the material inside the bind group data. - // High 16 bits: index of the lightmap in the binding array. - material_and_lightmap_bind_group_slot: u32, -} - -// The `wgpu` indirect parameters structure for indexed meshes. -// -// The `build_indirect_params.wgsl` shader generates these. -struct IndirectParametersIndexed { - // The number of indices that this mesh has. - index_count: u32, - // The number of instances we are to draw. - instance_count: u32, - // The offset of the first index for this mesh in the index buffer slab. - first_index: u32, - // The offset of the first vertex for this mesh in the vertex buffer slab. - base_vertex: u32, - // The index of the first mesh instance in the `Mesh` buffer. - first_instance: u32, -} - -// The `wgpu` indirect parameters structure for non-indexed meshes. -// -// The `build_indirect_params.wgsl` shader generates these. -struct IndirectParametersNonIndexed { - // The number of vertices that this mesh has. - vertex_count: u32, - // The number of instances we are to draw. - instance_count: u32, - // The offset of the first vertex for this mesh in the vertex buffer slab. - base_vertex: u32, - // The index of the first mesh instance in the `Mesh` buffer. - first_instance: u32, -} - -// Information needed to generate the `IndirectParametersIndexed` and -// `IndirectParametersNonIndexed` draw commands. -struct IndirectParametersMetadata { - // The index of the mesh in the `MeshInput` buffer. - mesh_index: u32, - // The index of the first instance corresponding to this batch in the `Mesh` - // buffer. - base_output_index: u32, - // The index of the batch set in the `IndirectBatchSet` buffer. - batch_set_index: u32, - // The number of instances that are to be drawn. - // - // The `mesh_preprocess.wgsl` shader determines this, and the - // `build_indirect_params.wgsl` shader copies this value into the indirect - // draw command. - instance_count: atomic, -} - -// Information about each batch set. -// -// A *batch set* is a set of meshes that might be multi-drawn together. -// -// The CPU creates this structure, and the `build_indirect_params.wgsl` shader -// modifies it. If `multi_draw_indirect_count` is in use, the GPU reads this -// value when multi-drawing a batch set in order to determine how many commands -// make up the batch set. -struct IndirectBatchSet { - // The number of commands that make up this batch set. - // - // The CPU initializes this value to zero. 
The `build_indirect_params.wgsl` - // shader increments this value as it processes batches. - indirect_parameters_count: atomic, - // The offset of the first batch corresponding to this batch set within the - // `IndirectParametersIndexed` or `IndirectParametersNonIndexed` arrays. - indirect_parameters_base: u32, -} diff --git a/crates/bevy_pbr/src/render/occlusion_culling.wgsl b/crates/bevy_pbr/src/render/occlusion_culling.wgsl new file mode 100644 index 00000000000000..1be999cc6a588b --- /dev/null +++ b/crates/bevy_pbr/src/render/occlusion_culling.wgsl @@ -0,0 +1,30 @@ +// Occlusion culling utility functions. + +#define_import_path bevy_pbr::occlusion_culling + +fn get_aabb_size_in_pixels(aabb: vec4, depth_pyramid: texture_2d) -> vec2 { + let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); + let aabb_width_pixels = (aabb.z - aabb.x) * depth_pyramid_size_mip_0.x; + let aabb_height_pixels = (aabb.w - aabb.y) * depth_pyramid_size_mip_0.y; + return vec2(aabb_width_pixels, aabb_height_pixels); +} + +fn get_occluder_depth( + aabb: vec4, + aabb_pixel_size: vec2, + depth_pyramid: texture_2d +) -> f32 { + let aabb_width_pixels = aabb_pixel_size.x; + let aabb_height_pixels = aabb_pixel_size.y; + + let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); + let depth_level = max(0, i32(ceil(log2(max(aabb_width_pixels, aabb_height_pixels))))); // TODO: Naga doesn't like this being a u32 + let depth_pyramid_size = vec2(textureDimensions(depth_pyramid, depth_level)); + let aabb_top_left = vec2(aabb.xy * depth_pyramid_size); + + let depth_quad_a = textureLoad(depth_pyramid, aabb_top_left, depth_level).x; + let depth_quad_b = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 0u), depth_level).x; + let depth_quad_c = textureLoad(depth_pyramid, aabb_top_left + vec2(0u, 1u), depth_level).x; + let depth_quad_d = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 1u), depth_level).x; + return min(min(depth_quad_a, depth_quad_b), min(depth_quad_c, depth_quad_d)); +} diff --git a/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl b/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl new file mode 100644 index 00000000000000..930959472555e2 --- /dev/null +++ b/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl @@ -0,0 +1,25 @@ +// Resets the indirect draw counts to zero. +// +// This shader is needed because we reuse the same indirect batch set count +// buffer (i.e. the buffer that gets passed to `multi_draw_indirect_count` to +// determine how many objects to draw) between phases (early, late, and main). +// Before launching `build_indirect_params.wgsl`, we need to reinitialize the +// value to 0. + +#import bevy_pbr::mesh_preprocess_types::IndirectBatchSet + +@group(0) @binding(0) var indirect_batch_sets: array; + +@compute +@workgroup_size(64) +fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { + // Figure out our instance index. If this thread doesn't correspond to any + // index, bail. + let instance_index = global_invocation_id.x; + if (instance_index >= arrayLength(&indirect_batch_sets)) { + return; + } + + // Reset the number of batch sets to 0. 
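+    // `build_indirect_params.wgsl` will increment it again as it emits the
+    // draw commands for the following phase.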
+ atomicStore(&indirect_batch_sets[instance_index].indirect_parameters_count, 0u); +} diff --git a/crates/bevy_pbr/src/render/view_transformations.wgsl b/crates/bevy_pbr/src/render/view_transformations.wgsl index 63ee78a0c06412..80c26d7b69b2e4 100644 --- a/crates/bevy_pbr/src/render/view_transformations.wgsl +++ b/crates/bevy_pbr/src/render/view_transformations.wgsl @@ -1,6 +1,7 @@ #define_import_path bevy_pbr::view_transformations #import bevy_pbr::mesh_view_bindings as view_bindings +#import bevy_pbr::prepass_bindings /// World space: /// +y is up @@ -93,6 +94,22 @@ fn direction_clip_to_view(clip_dir: vec4) -> vec3 { return view_dir.xyz; } +// ----------------- +// TO PREV. VIEW --- +// ----------------- + +fn position_world_to_prev_view(world_pos: vec3) -> vec3 { + let view_pos = prepass_bindings::previous_view_uniforms.view_from_world * + vec4(world_pos, 1.0); + return view_pos.xyz; +} + +fn position_world_to_prev_ndc(world_pos: vec3) -> vec3 { + let ndc_pos = prepass_bindings::previous_view_uniforms.clip_from_world * + vec4(world_pos, 1.0); + return ndc_pos.xyz / ndc_pos.w; +} + // ----------------- // TO CLIP --------- // ----------------- @@ -172,6 +189,19 @@ fn view_z_to_depth_ndc(view_z: f32) -> f32 { #endif } +fn prev_view_z_to_depth_ndc(view_z: f32) -> f32 { +#ifdef VIEW_PROJECTION_PERSPECTIVE + return -perspective_camera_near() / view_z; +#else ifdef VIEW_PROJECTION_ORTHOGRAPHIC + return prepass_bindings::previous_view_uniforms.clip_from_view[3][2] + + view_z * prepass_bindings::previous_view_uniforms.clip_from_view[2][2]; +#else + let ndc_pos = prepass_bindings::previous_view_uniforms.clip_from_view * + vec4(0.0, 0.0, view_z, 1.0); + return ndc_pos.z / ndc_pos.w; +#endif +} + // ----------------- // UV -------------- // ----------------- diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 2e893616f9294a..0226a40ffed622 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -11,13 +11,14 @@ use bevy_ecs::{ world::{FromWorld, World}, }; use bevy_encase_derive::ShaderType; -use bevy_utils::{default, TypeIdMap}; +use bevy_utils::{default, hashbrown::hash_map::Entry, TypeIdMap}; use bytemuck::{Pod, Zeroable}; use nonmax::NonMaxU32; use tracing::error; use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features}; use crate::{ + experimental::occlusion_culling::OcclusionCulling, render_phase::{ BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet, BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItemBatchSetKey as _, @@ -32,7 +33,15 @@ use crate::{ use super::{BatchMeta, GetBatchData, GetFullBatchData}; -pub struct BatchingPlugin; +#[derive(Default)] +pub struct BatchingPlugin { + /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so + /// that they can be read back to CPU. + /// + /// This is a debugging feature that may reduce performance. It primarily + /// exists for the `occlusion_culling` example. 
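+    ///
+    /// (The example uses this to copy the GPU-generated draw parameters back
+    /// to the CPU for inspection.)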
+    pub allow_copies_from_indirect_parameters: bool,
+}
 
 impl Plugin for BatchingPlugin {
     fn build(&self, app: &mut App) {
@@ -41,7 +50,9 @@ impl Plugin for BatchingPlugin {
         };
 
         render_app
-            .insert_resource(IndirectParametersBuffers::new())
+            .insert_resource(IndirectParametersBuffers::new(
+                self.allow_copies_from_indirect_parameters,
+            ))
             .add_systems(
                 Render,
                 write_indirect_parameters_buffers.in_set(RenderSet::PrepareResourcesFlush),
@@ -158,6 +169,11 @@ where
     /// data input uniform is expected to contain the index of the
     /// corresponding buffer data input uniform in this list.
     pub previous_input_buffer: InstanceInputUniformBuffer<BDI>,
+
+    pub late_indexed_indirect_parameters_buffer:
+        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,
+    pub late_non_indexed_indirect_parameters_buffer:
+        RawBufferVec<LatePreprocessWorkItemIndirectParameters>,
+}
 
 /// Holds the GPU buffer of instance input data, which is the data about each
@@ -259,6 +275,18 @@
             self.buffer.push(default());
         }
     }
+
+    pub fn len(&self) -> usize {
+        self.buffer.len()
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.buffer.is_empty()
+    }
+
+    pub fn into_buffer(self) -> RawBufferVec<BDI> {
+        self.buffer
+    }
 }
 
 impl<BDI> Default for InstanceInputUniformBuffer<BDI>
 where
@@ -288,25 +316,102 @@ pub enum PreprocessWorkItemBuffers {
         indexed: BufferVec<PreprocessWorkItem>,
         /// The buffer of work items corresponding to non-indexed meshes.
         non_indexed: BufferVec<PreprocessWorkItem>,
+        gpu_occlusion_culling: Option<GpuOcclusionCullingWorkItemBuffers>,
     },
 }
 
-impl PreprocessWorkItemBuffers {
-    /// Creates a new set of buffers.
-    ///
-    /// `no_indirect_drawing` specifies whether we're drawing directly or
-    /// indirectly.
-    pub fn new(no_indirect_drawing: bool) -> Self {
-        if no_indirect_drawing {
-            PreprocessWorkItemBuffers::Direct(BufferVec::new(BufferUsages::STORAGE))
-        } else {
-            PreprocessWorkItemBuffers::Indirect {
-                indexed: BufferVec::new(BufferUsages::STORAGE),
-                non_indexed: BufferVec::new(BufferUsages::STORAGE),
+pub struct GpuOcclusionCullingWorkItemBuffers {
+    pub late_indexed: UninitBufferVec<PreprocessWorkItem>,
+    pub late_non_indexed: UninitBufferVec<PreprocessWorkItem>,
+    pub late_indirect_parameters_indexed_offset: u32,
+    pub late_indirect_parameters_non_indexed_offset: u32,
+}
+
+#[derive(Clone, Copy, ShaderType, Pod, Zeroable)]
+#[repr(C)]
+pub struct LatePreprocessWorkItemIndirectParameters {
+    dispatch_x: u32,
+    dispatch_y: u32,
+    dispatch_z: u32,
+    work_item_count: u32,
+    pad_a: u32,
+    pad_b: u32,
+    pad_c: u32,
+    pad_d: u32,
+}
+
+impl Default for LatePreprocessWorkItemIndirectParameters {
+    fn default() -> LatePreprocessWorkItemIndirectParameters {
+        LatePreprocessWorkItemIndirectParameters {
+            dispatch_x: 0,
+            dispatch_y: 1,
+            dispatch_z: 1,
+            work_item_count: 0,
+            pad_a: 0,
+            pad_b: 0,
+            pad_c: 0,
+            pad_d: 0,
+        }
+    }
+}
+
+pub fn get_or_create_work_item_buffer<'a, I>(
+    work_item_buffers: &'a mut EntityHashMap<TypeIdMap<PreprocessWorkItemBuffers>>,
+    view: Entity,
+    no_indirect_drawing: bool,
+    gpu_occlusion_culling: bool,
+    late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
+        LatePreprocessWorkItemIndirectParameters,
+    >,
+    late_non_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
+        LatePreprocessWorkItemIndirectParameters,
+    >,
+) -> &'a mut PreprocessWorkItemBuffers
+where
+    I: 'static,
+{
+    match work_item_buffers
+        .entry(view)
+        .or_default()
+        .entry(TypeId::of::<I>())
+    {
+        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
+        Entry::Vacant(vacant_entry) => {
+            if no_indirect_drawing {
+                vacant_entry.insert(PreprocessWorkItemBuffers::Direct(BufferVec::new(
+                    BufferUsages::STORAGE,
+                )))
+            } else {
+                vacant_entry.insert(PreprocessWorkItemBuffers::Indirect {
+                    indexed: BufferVec::new(BufferUsages::STORAGE),
+                    non_indexed: BufferVec::new(BufferUsages::STORAGE),
+                    gpu_occlusion_culling: if gpu_occlusion_culling {
+                        let late_indirect_parameters_indexed_offset =
+                            late_indexed_indirect_parameters_buffer
+                                .push(LatePreprocessWorkItemIndirectParameters::default())
+                                * size_of::<LatePreprocessWorkItemIndirectParameters>();
+                        let late_indirect_parameters_non_indexed_offset =
+                            late_non_indexed_indirect_parameters_buffer
+                                .push(LatePreprocessWorkItemIndirectParameters::default())
+                                * size_of::<LatePreprocessWorkItemIndirectParameters>();
+                        Some(GpuOcclusionCullingWorkItemBuffers {
+                            late_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
+                            late_non_indexed: UninitBufferVec::new(BufferUsages::STORAGE),
+                            late_indirect_parameters_indexed_offset:
+                                late_indirect_parameters_indexed_offset as u32,
+                            late_indirect_parameters_non_indexed_offset:
+                                late_indirect_parameters_non_indexed_offset as u32,
+                        })
+                    } else {
+                        None
+                    },
+                })
+            }
+        }
+    }
+}
+
+impl PreprocessWorkItemBuffers {
     /// Adds a new work item to the appropriate buffer.
     ///
     /// `indexed` specifies whether the work item corresponds to an indexed
     /// mesh.
@@ -319,12 +424,21 @@
             PreprocessWorkItemBuffers::Indirect {
                 indexed: ref mut indexed_buffer,
                 non_indexed: ref mut non_indexed_buffer,
+                ref mut gpu_occlusion_culling,
             } => {
                 if indexed {
                     indexed_buffer.push(preprocess_work_item);
                 } else {
                     non_indexed_buffer.push(preprocess_work_item);
                 }
+
+                if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
+                    if indexed {
+                        gpu_occlusion_culling.late_indexed.add();
+                    } else {
+                        gpu_occlusion_culling.late_non_indexed.add();
+                    }
+                }
             }
         }
     }
@@ -352,7 +466,7 @@ pub struct PreprocessWorkItem {
 ///
 /// This is the variant for indexed meshes. We generate the instances of this
 /// structure in the `build_indirect_params.wgsl` compute shader.
-#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
+#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
 #[repr(C)]
 pub struct IndirectParametersIndexed {
     /// The number of indices that this mesh has.
@@ -371,7 +485,7 @@
 ///
 /// This is the variant for non-indexed meshes. We generate the instances of
 /// this structure in the `build_indirect_params.wgsl` compute shader.
-#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
+#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)]
 #[repr(C)]
 pub struct IndirectParametersNonIndexed {
     /// The number of vertices that this mesh has.
@@ -422,7 +536,8 @@ pub struct IndirectParametersMetadata {
     ///
     /// The CPU sets this value to 0, and the GPU mesh preprocessing shader
     /// increments it as it culls mesh instances.
-    pub instance_count: u32,
+    pub early_instance_count: u32,
+    pub late_instance_count: u32,
 }
 
 /// A structure, shared between CPU and GPU, that holds the number of on-GPU
@@ -519,16 +634,19 @@ pub struct IndirectParametersBuffers {
 
 impl IndirectParametersBuffers {
     /// Creates the indirect parameters buffers.
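An aside on the bookkeeping in `get_or_create_work_item_buffer` above: `RawBufferVec::push` returns the index of the new element, so multiplying by `size_of::<LatePreprocessWorkItemIndirectParameters>()` converts that index into the byte offset of this phase's late-dispatch record, presumably the offset at which the late preprocessing pass issues its `dispatch_workgroups_indirect`. A standalone sketch of the same arithmetic, with hypothetical names that are not part of this patch:

```rust
// Each view/phase appends one 8-u32 (32-byte) record mirroring
// `LatePreprocessWorkItemIndirectParameters` and remembers where it landed.
const RECORD_SIZE: usize = 8 * std::mem::size_of::<u32>(); // 32 bytes

struct LateDispatchRecords(Vec<[u32; 8]>);

impl LateDispatchRecords {
    /// Appends a default record (dispatch_x = 0, dispatch_y = 1,
    /// dispatch_z = 1, work_item_count = 0, then padding) and returns the
    /// byte offset at which it starts.
    fn allocate(&mut self) -> u32 {
        let index = self.0.len();
        self.0.push([0, 1, 1, 0, 0, 0, 0, 0]);
        (index * RECORD_SIZE) as u32
    }
}

fn main() {
    let mut records = LateDispatchRecords(Vec::new());
    assert_eq!(records.allocate(), 0); // first phase's record
    assert_eq!(records.allocate(), 32); // the next record starts 32 bytes in
}
```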
- pub fn new() -> IndirectParametersBuffers { + pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers { + let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT; + if allow_copies_from_indirect_parameter_buffers { + indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC; + } + IndirectParametersBuffers { - non_indexed_data: UninitBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + non_indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages), non_indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE), - non_indexed_batch_sets: RawBufferVec::new( - BufferUsages::STORAGE | BufferUsages::INDIRECT, - ), - indexed_data: UninitBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + non_indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), + indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages), indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE), - indexed_batch_sets: RawBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), } } @@ -618,7 +736,7 @@ impl IndirectParametersBuffers { /// /// This allocates in both the [`Self::non_indexed_metadata`] and /// [`Self::non_indexed_data`] buffers. - fn allocate_non_indexed(&mut self, count: u32) -> u32 { + pub fn allocate_non_indexed(&mut self, count: u32) -> u32 { let length = self.non_indexed_data.len(); self.non_indexed_metadata.reserve_internal(count as usize); for _ in 0..count { @@ -711,11 +829,17 @@ impl IndirectParametersBuffers { }); } } + + pub fn get_next_batch_set_index(&self, indexed: bool) -> Option { + NonMaxU32::new(self.batch_set_count(indexed) as u32) + } } impl Default for IndirectParametersBuffers { fn default() -> Self { - Self::new() + // By default, we don't allow GPU indirect parameter mapping, since + // that's a debugging option. + Self::new(false) } } @@ -754,7 +878,7 @@ impl FromWorld for GpuPreprocessingSupport { impl BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod + Default, + BDI: Pod + Sync + Send + Default + 'static, { /// Creates new buffers. pub fn new() -> Self { @@ -763,6 +887,12 @@ where work_item_buffers: EntityHashMap::default(), current_input_buffer: InstanceInputUniformBuffer::new(), previous_input_buffer: InstanceInputUniformBuffer::new(), + late_indexed_indirect_parameters_buffer: RawBufferVec::new( + BufferUsages::STORAGE | BufferUsages::INDIRECT, + ), + late_non_indexed_indirect_parameters_buffer: RawBufferVec::new( + BufferUsages::STORAGE | BufferUsages::INDIRECT, + ), } } @@ -778,28 +908,16 @@ where /// Clears out the buffers in preparation for a new frame. 
pub fn clear(&mut self) { self.data_buffer.clear(); - - for view_work_item_buffers in self.work_item_buffers.values_mut() { - for phase_work_item_buffers in view_work_item_buffers.values_mut() { - match *phase_work_item_buffers { - PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => buffer_vec.clear(), - PreprocessWorkItemBuffers::Indirect { - ref mut indexed, - ref mut non_indexed, - } => { - indexed.clear(); - non_indexed.clear(); - } - } - } - } + self.late_indexed_indirect_parameters_buffer.clear(); + self.late_non_indexed_indirect_parameters_buffer.clear(); + self.work_item_buffers.clear(); } } impl Default for BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod + Default, + BDI: Pod + Default + Sync + Send + 'static, { fn default() -> Self { Self::new() @@ -855,9 +973,14 @@ where let (batch_range, batch_extra_index) = phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut(); *batch_range = self.instance_start_index..instance_end_index; - *batch_extra_index = - PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index); - + *batch_extra_index = match self.indirect_parameters_index { + Some(indirect_parameters_index) => PhaseItemExtraIndex::IndirectParametersIndex { + range: u32::from(indirect_parameters_index) + ..(u32::from(indirect_parameters_index) + 1), + batch_set_index: None, + }, + None => PhaseItemExtraIndex::None, + }; if let Some(indirect_parameters_index) = self.indirect_parameters_index { indirect_parameters_buffers .add_batch_set(self.indexed, indirect_parameters_index.into()); @@ -909,7 +1032,12 @@ pub fn batch_and_prepare_sorted_render_phase( gpu_array_buffer: ResMut>, mut indirect_parameters_buffers: ResMut, mut sorted_render_phases: ResMut>, - mut views: Query<(Entity, &ExtractedView, Has)>, + mut views: Query<( + Entity, + &ExtractedView, + Has, + Has, + )>, system_param_item: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + SortedPhaseItem, @@ -919,20 +1047,25 @@ pub fn batch_and_prepare_sorted_render_phase( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. } = gpu_array_buffer.into_inner(); - for (view, extracted_view, no_indirect_drawing) in &mut views { + for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views { let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else { continue; }; // Create the work item buffer if necessary. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = get_or_create_work_item_buffer::( + work_item_buffers, + view, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Walk through the list of phase items, building up batches as we go. 
let mut batch: Option> = None; @@ -1055,7 +1188,15 @@ pub fn batch_and_prepare_binned_render_phase( gpu_array_buffer: ResMut>, mut indirect_parameters_buffers: ResMut, mut binned_render_phases: ResMut>, - mut views: Query<(Entity, &ExtractedView, Has)>, + mut views: Query< + ( + Entity, + &ExtractedView, + Has, + Has, + ), + With, + >, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, @@ -1066,21 +1207,26 @@ pub fn batch_and_prepare_binned_render_phase( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. } = gpu_array_buffer.into_inner(); - for (view, extracted_view, no_indirect_drawing) in &mut views { + for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views { let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else { continue; }; // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = get_or_create_work_item_buffer::( + work_item_buffers, + view, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Prepare multidrawables. @@ -1124,10 +1270,9 @@ pub fn batch_and_prepare_binned_render_phase( // Start a new batch, in indirect mode. let indirect_parameters_index = indirect_parameters_buffers.allocate(batch_set_key.indexed(), 1); - let batch_set_index = NonMaxU32::new( - indirect_parameters_buffers.batch_set_count(batch_set_key.indexed()) - as u32, - ); + let batch_set_index = indirect_parameters_buffers + .get_next_batch_set_index(batch_set_key.indexed()); + GFBD::write_batch_indirect_parameters_metadata( input_index.into(), batch_set_key.indexed(), @@ -1233,9 +1378,9 @@ pub fn batch_and_prepare_binned_render_phase( // Start a new batch, in indirect mode. 
let indirect_parameters_index = indirect_parameters_buffers.allocate(key.0.indexed(), 1); - let batch_set_index = NonMaxU32::new( - indirect_parameters_buffers.batch_set_count(key.0.indexed()) as u32, - ); + let batch_set_index = + indirect_parameters_buffers.get_next_batch_set_index(key.0.indexed()); + GFBD::write_batch_indirect_parameters_metadata( input_index.into(), key.0.indexed(), @@ -1255,9 +1400,10 @@ pub fn batch_and_prepare_binned_render_phase( batch = Some(BinnedRenderPhaseBatch { representative_entity: (entity, main_entity), instance_range: output_index..output_index + 1, - extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index( - NonMaxU32::new(indirect_parameters_index), - ), + extra_index: PhaseItemExtraIndex::IndirectParametersIndex { + range: indirect_parameters_index..(indirect_parameters_index + 1), + batch_set_index: None, + }, }); } @@ -1395,6 +1541,8 @@ pub fn write_batched_instance_buffers( ref mut work_item_buffers, ref mut current_input_buffer, ref mut previous_input_buffer, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, } = gpu_array_buffer.into_inner(); data_buffer.write_buffer(&render_device); @@ -1404,6 +1552,8 @@ pub fn write_batched_instance_buffers( previous_input_buffer .buffer .write_buffer(&render_device, &render_queue); + late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue); + late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue); for view_work_item_buffers in work_item_buffers.values_mut() { for phase_work_item_buffers in view_work_item_buffers.values_mut() { @@ -1414,9 +1564,27 @@ pub fn write_batched_instance_buffers( PreprocessWorkItemBuffers::Indirect { ref mut indexed, ref mut non_indexed, + ref mut gpu_occlusion_culling, } => { indexed.write_buffer(&render_device, &render_queue); non_indexed.write_buffer(&render_device, &render_queue); + + if let Some(GpuOcclusionCullingWorkItemBuffers { + ref mut late_indexed, + ref mut late_non_indexed, + late_indirect_parameters_indexed_offset: _, + late_indirect_parameters_non_indexed_offset: _, + }) = *gpu_occlusion_culling + { + if late_indexed.is_empty() { + late_indexed.add(); + } + if late_non_indexed.is_empty() { + late_non_indexed.add(); + } + late_indexed.write_buffer(&render_device); + late_non_indexed.write_buffer(&render_device); + } } } } diff --git a/crates/bevy_render/src/experimental/mod.rs b/crates/bevy_render/src/experimental/mod.rs new file mode 100644 index 00000000000000..40bb6cf1dcc4db --- /dev/null +++ b/crates/bevy_render/src/experimental/mod.rs @@ -0,0 +1,6 @@ +//! Experimental rendering features. +//! +//! Experimental features are features with known problems, but are included +//! nonetheless for testing purposes. + +pub mod occlusion_culling; diff --git a/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl b/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl new file mode 100644 index 00000000000000..7f4dd71f610f04 --- /dev/null +++ b/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl @@ -0,0 +1,60 @@ +// Types needed for GPU mesh uniform building. + +#define_import_path bevy_pbr::mesh_preprocess_types + +// Per-frame data that the CPU supplies to the GPU. +struct MeshInput { + // The model transform. + world_from_local: mat3x4, + // The lightmap UV rect, packed into 64 bits. + lightmap_uv_rect: vec2, + // Various flags. 
+    flags: u32,
+    previous_input_index: u32,
+    first_vertex_index: u32,
+    first_index_index: u32,
+    index_count: u32,
+    current_skin_index: u32,
+    previous_skin_index: u32,
+    // Low 16 bits: index of the material inside the bind group data.
+    // High 16 bits: index of the lightmap in the binding array.
+    material_and_lightmap_bind_group_slot: u32,
+    pad_a: u32,
+    pad_b: u32,
+}
+
+// The `wgpu` indirect parameters structure. This is a union of two structures.
+// For more information, see the corresponding comment in
+// `gpu_preprocessing.rs`.
+struct IndirectParametersIndexed {
+    // `vertex_count` or `index_count`.
+    index_count: u32,
+    // `instance_count` in both structures.
+    instance_count: u32,
+    // `first_vertex` or `first_index`.
+    first_index: u32,
+    // `base_vertex` or `first_instance`.
+    base_vertex: u32,
+    // A read-only copy of `instance_index`.
+    first_instance: u32,
+}
+
+struct IndirectParametersNonIndexed {
+    vertex_count: u32,
+    instance_count: u32,
+    base_vertex: u32,
+    first_instance: u32,
+}
+
+struct IndirectParametersMetadata {
+    mesh_index: u32,
+    base_output_index: u32,
+    batch_set_index: u32,
+    early_instance_count: atomic<u32>,
+    late_instance_count: atomic<u32>,
+}
+
+struct IndirectBatchSet {
+    indirect_parameters_count: atomic<u32>,
+    indirect_parameters_base: u32,
+}
diff --git a/crates/bevy_render/src/experimental/occlusion_culling/mod.rs b/crates/bevy_render/src/experimental/occlusion_culling/mod.rs
new file mode 100644
index 00000000000000..d9e26bc181ee1f
--- /dev/null
+++ b/crates/bevy_render/src/experimental/occlusion_culling/mod.rs
@@ -0,0 +1,84 @@
+//! GPU occlusion culling.
+//!
+//! See [`OcclusionCulling`] for a detailed description of occlusion culling in
+//! Bevy.
+
+use bevy_app::{App, Plugin};
+use bevy_asset::{load_internal_asset, Handle};
+use bevy_ecs::{component::Component, prelude::ReflectComponent};
+use bevy_reflect::{prelude::ReflectDefault, Reflect};
+
+use crate::{extract_component::ExtractComponent, render_resource::Shader};
+
+/// The handle to the `mesh_preprocess_types.wgsl` compute shader.
+pub const MESH_PREPROCESS_TYPES_SHADER_HANDLE: Handle<Shader> =
+    Handle::weak_from_u128(2720440370122465935);
+
+/// Enables GPU occlusion culling.
+///
+/// See [`OcclusionCulling`] for a detailed description of occlusion culling in
+/// Bevy.
+pub struct OcclusionCullingPlugin;
+
+impl Plugin for OcclusionCullingPlugin {
+    fn build(&self, app: &mut App) {
+        load_internal_asset!(
+            app,
+            MESH_PREPROCESS_TYPES_SHADER_HANDLE,
+            "mesh_preprocess_types.wgsl",
+            Shader::from_wgsl
+        );
+    }
+}
+
+/// Add this component to a view in order to enable experimental GPU occlusion
+/// culling.
+///
+/// *Bevy's occlusion culling is currently marked as experimental.* There are
+/// known issues whereby, in rare circumstances, occlusion culling can result in
+/// meshes being culled that shouldn't be (i.e. meshes that turn invisible).
+/// Please try it out and report issues.
+///
+/// *Occlusion culling* allows Bevy to avoid rendering objects that are fully
+/// behind other opaque or alpha tested objects. This is different from, and
+/// complements, the depth fragment rejection that the `DepthPrepass` enables.
+/// While depth rejection allows Bevy to avoid rendering *pixels* that are
+/// behind other objects, the GPU still has to examine those pixels to reject
+/// them, which requires transforming the vertices of the objects and
+/// performing skinning if the objects were skinned.
Occlusion culling allows the GPU to go +/// a step further, avoiding even transforming the vertices of objects that it +/// can quickly prove to be behind other objects. +/// +/// Occlusion culling inherently has some overhead, because Bevy must examine +/// the objects' bounding boxes, and create an acceleration structure +/// (hierarchical Z-buffer) to perform the occlusion tests. Therefore, occlusion +/// culling is disabled by default. Only enable it if you measure it to be a +/// speedup on your scene. Note that, because Bevy's occlusion culling runs on +/// the GPU and is quite efficient, it's rare for occlusion culling to result in +/// a significant slowdown. +/// +/// Occlusion culling currently requires a `DepthPrepass`. If no depth prepass +/// is present on the view, the [`OcclusionCulling`] component will be ignored. +/// +/// The algorithm that Bevy uses is known as [*two-phase occlusion culling*]. +/// When you enable occlusion culling, Bevy splits the depth prepass into two: +/// an *early* depth prepass and a *late* depth prepass. The early depth prepass +/// renders all the meshes that were visible last frame to produce a +/// conservative approximation of the depth buffer. Then, after producing an +/// acceleration structure known as a hierarchical Z-buffer or depth pyramid, +/// Bevy tests the bounding boxes of all meshes against that depth buffer. Those +/// that can be quickly proven to be behind the geometry rendered during the +/// early depth prepass are skipped entirely. The other potentially-visible +/// meshes are rendered during the late prepass, and finally all the visible +/// meshes are rendered as usual during the opaque, transparent, etc. passes. +/// +/// Unlike other occlusion culling systems you may be familiar with, Bevy's +/// occlusion culling is fully dynamic and requires no baking step. The CPU +/// overhead is minimal. Large skinned meshes and other dynamic objects can +/// occlude other objects. +/// +/// [*two-phase occlusion culling*]: +/// https://medium.com/@mil_kru/two-pass-occlusion-culling-4100edcad501 +#[derive(Component, ExtractComponent, Clone, Copy, Default, Reflect)] +#[reflect(Component, Default)] +pub struct OcclusionCulling; diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index 32bc6d0305278e..849dec0ce4f51c 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -23,6 +23,7 @@ pub mod alpha; pub mod batching; pub mod camera; pub mod diagnostic; +pub mod experimental; pub mod extract_component; pub mod extract_instances; mod extract_param; @@ -74,6 +75,7 @@ pub use extract_param::Extract; use bevy_hierarchy::ValidParentCheckPlugin; use bevy_window::{PrimaryWindow, RawHandleWrapperHolder}; +use experimental::occlusion_culling::OcclusionCullingPlugin; use extract_resource::ExtractResourcePlugin; use globals::GlobalsPlugin; use render_asset::RenderAssetBytesPerFrame; @@ -116,6 +118,12 @@ pub struct RenderPlugin { /// If `true`, disables asynchronous pipeline compilation. /// This has no effect on macOS, Wasm, iOS, or without the `multi_threaded` feature. pub synchronous_pipeline_compilation: bool, + /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so + /// that they can be read back to CPU. + /// + /// This is a debugging feature that may reduce performance. It primarily + /// exists for the `occlusion_culling` example. + pub allow_copies_from_indirect_parameters: bool, } /// The systems sets of the default [`App`] rendering schedule. 
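For reference, enabling the feature end to end touches two places: the camera needs both marker components, and reading the indirect draw parameters back on the CPU (as the `occlusion_culling` example does) requires opting in on `RenderPlugin`. A minimal sketch, condensed from the example later in this patch:

```rust
use bevy::{
    core_pipeline::prepass::DepthPrepass,
    prelude::*,
    render::{experimental::occlusion_culling::OcclusionCulling, RenderPlugin},
};

fn main() {
    App::new()
        .add_plugins(DefaultPlugins.set(RenderPlugin {
            // Only needed if you want to read the indirect draw parameters
            // back on the CPU (a debugging feature); occlusion culling itself
            // doesn't require it.
            allow_copies_from_indirect_parameters: true,
            ..default()
        }))
        .add_systems(Startup, |mut commands: Commands| {
            // `OcclusionCulling` is ignored unless a depth prepass is present.
            commands.spawn((Camera3d::default(), DepthPrepass, OcclusionCulling));
        })
        .run();
}
```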
@@ -356,10 +364,13 @@ impl Plugin for RenderPlugin { MeshPlugin, GlobalsPlugin, MorphPlugin, - BatchingPlugin, + BatchingPlugin { + allow_copies_from_indirect_parameters: self.allow_copies_from_indirect_parameters, + }, SyncWorldPlugin, StoragePlugin, GpuReadbackPlugin::default(), + OcclusionCullingPlugin, )); app.init_resource::() diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 8191671c15bb85..d3f16ac3413fd5 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -202,6 +202,18 @@ impl RawBufferVec { } } +impl RawBufferVec +where + T: NoUninit + Default, +{ + pub fn grow_set(&mut self, index: u32, value: T) { + while index as usize + 1 > self.len() { + self.values.push(T::default()); + } + self.values[index as usize] = value; + } +} + impl Extend for RawBufferVec { #[inline] fn extend>(&mut self, iter: I) { diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs index b5ad7a541e2975..5b6ab63a067fe2 100644 --- a/crates/bevy_render/src/view/mod.rs +++ b/crates/bevy_render/src/view/mod.rs @@ -10,6 +10,7 @@ use crate::{ CameraMainTextureUsages, ClearColor, ClearColorConfig, Exposure, ExtractedCamera, ManualTextureViews, MipBias, NormalizedRenderTarget, TemporalJitter, }, + experimental::occlusion_culling::OcclusionCulling, extract_component::ExtractComponentPlugin, prelude::Shader, primitives::Frustum, @@ -109,9 +110,11 @@ impl Plugin for ViewPlugin { .register_type::() .register_type::() .register_type::() + .register_type::() // NOTE: windows.is_changed() handles cases where a window was resized .add_plugins(( ExtractComponentPlugin::::default(), + ExtractComponentPlugin::::default(), VisibilityPlugin, VisibilityRangePlugin, )); diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs index a4dfc376682c45..9d8737bfc2c938 100644 --- a/crates/bevy_sprite/src/mesh2d/material.rs +++ b/crates/bevy_sprite/src/mesh2d/material.rs @@ -17,6 +17,7 @@ use bevy_ecs::{ }; use bevy_math::FloatOrd; use bevy_reflect::{prelude::ReflectDefault, Reflect}; +use bevy_render::view::RenderVisibleEntities; use bevy_render::{ mesh::{MeshVertexBufferLayoutRef, RenderMesh}, render_asset::{ @@ -33,7 +34,7 @@ use bevy_render::{ SpecializedMeshPipelineError, SpecializedMeshPipelines, }, renderer::RenderDevice, - view::{ExtractedView, Msaa, RenderVisibleEntities, ViewVisibility}, + view::{ExtractedView, Msaa, ViewVisibility}, Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; use bevy_render::{render_resource::BindingResources, sync_world::MainEntityHashMap}; diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 52309fd492aab2..86d02d5680b8be 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -421,7 +421,8 @@ impl GetFullBatchData for Mesh2dPipeline { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, - instance_count: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs new file mode 100644 index 00000000000000..c25b30d427ce8a --- /dev/null +++ b/examples/3d/occlusion_culling.rs @@ -0,0 +1,587 @@ +//! Demonstrates occlusion culling. +//! +//! This demo rotates many small cubes around a rotating large cube at the +//! origin. 
At all times, the large cube will be occluding several of the small
+//! cubes. The demo displays the number of cubes that were actually rendered, so
+//! the effects of occlusion culling can be seen.
+
+use std::{
+    f32::consts::PI,
+    result::Result,
+    sync::{Arc, Mutex},
+};
+
+use bevy::{
+    color::palettes::css::{SILVER, WHITE},
+    core_pipeline::{
+        core_3d::graph::{Core3d, Node3d},
+        prepass::DepthPrepass,
+    },
+    prelude::*,
+    render::{
+        batching::gpu_preprocessing::{IndirectParametersBuffers, IndirectParametersIndexed},
+        experimental::occlusion_culling::OcclusionCulling,
+        render_graph::{self, NodeRunError, RenderGraphApp, RenderGraphContext, RenderLabel},
+        render_resource::{Buffer, BufferDescriptor, BufferUsages, MapMode},
+        renderer::{RenderContext, RenderDevice},
+        Render, RenderApp, RenderPlugin, RenderSet,
+    },
+};
+use bytemuck::Pod;
+
+/// The radius of the spinning sphere of cubes.
+const OUTER_RADIUS: f32 = 3.0;
+
+/// The density of cubes in the outer sphere.
+const OUTER_SUBDIVISION_COUNT: u32 = 5;
+
+/// The speed at which the outer sphere and large cube rotate in radians per
+/// frame.
+const ROTATION_SPEED: f32 = 0.01;
+
+/// The length of each side of the small cubes, in meters.
+const SMALL_CUBE_SIZE: f32 = 0.1;
+
+/// The length of each side of the large cube, in meters.
+const LARGE_CUBE_SIZE: f32 = 2.0;
+
+/// A marker component for the immediate parent of the large sphere of cubes.
+#[derive(Default, Component)]
+struct SphereParent;
+
+/// A marker component for the large spinning cube at the origin.
+#[derive(Default, Component)]
+struct LargeCube;
+
+/// A plugin for the render app that reads the number of culled meshes from the
+/// GPU back to the CPU.
+struct ReadbackIndirectParametersPlugin;
+
+/// The node that we insert into the render graph in order to read the number of
+/// culled meshes from the GPU back to the CPU.
+#[derive(Default)]
+struct ReadbackIndirectParametersNode;
+
+/// The [`RenderLabel`] that we use to identify the
+/// [`ReadbackIndirectParametersNode`].
+#[derive(Clone, PartialEq, Eq, Hash, Debug, RenderLabel)]
+struct ReadbackIndirectParameters;
+
+/// The intermediate staging buffers that we use to read back the indirect
+/// parameters from the GPU to the CPU.
+///
+/// We read back the GPU indirect parameters so that we can determine the number
+/// of meshes that were culled.
+///
+/// `wgpu` doesn't allow us to read indirect buffers back from the GPU to the
+/// CPU directly. Instead, we have to copy them to a temporary staging buffer
+/// first, and then read *those* buffers back from the GPU to the CPU. This
+/// resource holds those temporary buffers.
+#[derive(Resource, Default)]
+struct IndirectParametersStagingBuffers {
+    /// The buffer that stores the indirect draw commands.
+    ///
+    /// See [`IndirectParametersIndexed`] for more information about the memory
+    /// layout of this buffer.
+    data: Option<Buffer>,
+    /// The buffer that stores the *number* of indirect draw commands.
+    ///
+    /// We only care about the first `u32` in this buffer.
+    batch_sets: Option<Buffer>,
+}
+
+/// A resource, shared between the main world and the render world, that saves a
+/// CPU-side copy of the GPU buffer that stores the indirect draw parameters.
+///
+/// This is needed so that we can display the number of meshes that were culled.
+/// It's reference counted, and protected by a lock, because we don't precisely
+/// know when the GPU will be ready to present the CPU with the buffer copy.
+/// Even though the rendering runs at least a frame ahead of the main app logic,
+/// we don't require more precise synchronization than the lock because we don't
+/// really care how up-to-date the counter of culled meshes is. If it's off by a
+/// few frames, that's no big deal.
+#[derive(Clone, Resource, Default, Deref, DerefMut)]
+struct SavedIndirectParameters(Arc<Mutex<SavedIndirectParametersData>>);
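The staging-buffer dance described above is the standard `wgpu` readback pattern, which the example implements in two steps further down. A condensed sketch of just the pattern, using the same Bevy types; the function and names here are illustrative only, not part of the patch:

```rust
use bevy::render::{
    render_resource::{Buffer, BufferDescriptor, BufferUsages},
    renderer::{RenderContext, RenderDevice},
};

/// Records a GPU-side copy of `source` into a freshly created staging buffer
/// that the CPU is allowed to map later.
fn copy_for_readback(
    render_device: &RenderDevice,
    render_context: &mut RenderContext,
    source: &Buffer,
) -> Buffer {
    // The staging buffer must be `MAP_READ` (so the CPU can map it) and
    // `COPY_DST` (so the GPU can copy into it).
    let staging = render_device.create_buffer(&BufferDescriptor {
        label: Some("readback staging buffer"),
        size: source.size(),
        usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
    // Once this copy has been submitted and completed, the CPU can call
    // `staging.slice(..).map_async(MapMode::Read, ...)` to read it.
    render_context
        .command_encoder()
        .copy_buffer_to_buffer(source, 0, &staging, 0, source.size());
    staging
}
```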
+
+/// A CPU-side copy of the GPU buffer that stores the indirect draw parameters.
+///
+/// This is needed so that we can display the number of meshes that were culled.
+#[derive(Default)]
+struct SavedIndirectParametersData {
+    /// The CPU-side copy of the GPU buffer that stores the indirect draw
+    /// parameters.
+    data: Vec<IndirectParametersIndexed>,
+    /// The CPU-side copy of the GPU buffer that stores the *number* of indirect
+    /// draw parameters that we have.
+    ///
+    /// All we care about is the number of indirect draw parameters for a single
+    /// view, so this is only one word in size.
+    count: u32,
+}
+
+/// The demo's current settings.
+#[derive(Resource)]
+struct AppStatus {
+    /// Whether occlusion culling is presently enabled.
+    ///
+    /// By default, this is set to true.
+    occlusion_culling: bool,
+}
+
+impl Default for AppStatus {
+    fn default() -> Self {
+        AppStatus {
+            occlusion_culling: true,
+        }
+    }
+}
+
+fn main() {
+    App::new()
+        .add_plugins(
+            DefaultPlugins
+                .set(WindowPlugin {
+                    primary_window: Some(Window {
+                        title: "Bevy Occlusion Culling Example".into(),
+                        ..default()
+                    }),
+                    ..default()
+                })
+                .set(RenderPlugin {
+                    allow_copies_from_indirect_parameters: true,
+                    ..default()
+                }),
+        )
+        .add_plugins(ReadbackIndirectParametersPlugin)
+        .init_resource::<AppStatus>()
+        .add_systems(Startup, setup)
+        .add_systems(Update, spin_small_cubes)
+        .add_systems(Update, spin_large_cube)
+        .add_systems(Update, update_status_text)
+        .add_systems(Update, toggle_occlusion_culling_on_request)
+        .run();
+}
+
+impl Plugin for ReadbackIndirectParametersPlugin {
+    fn build(&self, app: &mut App) {
+        // Create the `SavedIndirectParameters` resource that we're going to use
+        // to communicate between the thread that the GPU-to-CPU readback
+        // callback runs on and the main application threads. This resource is
+        // atomically reference counted. We store one reference to the
+        // `SavedIndirectParameters` in the main app and another reference in
+        // the render app.
+        let saved_indirect_parameters = SavedIndirectParameters::default();
+        app.insert_resource(saved_indirect_parameters.clone());
+
+        // Fetch the render app.
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app
+            .init_resource::<IndirectParametersStagingBuffers>()
+            // Insert another reference to the `SavedIndirectParameters`.
+            .insert_resource(saved_indirect_parameters)
+            .add_systems(ExtractSchedule, readback_indirect_parameters)
+            .add_systems(
+                Render,
+                create_indirect_parameters_staging_buffers.in_set(RenderSet::PrepareResourcesFlush),
+            )
+            // Add the node that allows us to read the indirect parameters back
+            // from the GPU to the CPU, which allows us to determine how many
+            // meshes were culled.
+            .add_render_graph_node::<ReadbackIndirectParametersNode>(
+                Core3d,
+                ReadbackIndirectParameters,
+            )
+            // We read back the indirect parameters any time after
+            // `EndMainPass`. Readback doesn't particularly need to execute
+            // before `EndMainPassPostProcessing`, but we specify that anyway
+            // because we want to make the indirect parameters run before
+            // *something* in the graph, and `EndMainPassPostProcessing` is as
+            // good a node as any other.
+ .add_render_graph_edges( + Core3d, + ( + Node3d::EndMainPass, + ReadbackIndirectParameters, + Node3d::EndMainPassPostProcessing, + ), + ); + } +} + +/// Spawns all the objects in the scene. +fn setup( + mut commands: Commands, + asset_server: Res, + mut meshes: ResMut>, + mut materials: ResMut>, +) { + spawn_small_cubes(&mut commands, &mut meshes, &mut materials); + spawn_large_cube(&mut commands, &asset_server, &mut meshes, &mut materials); + spawn_light(&mut commands); + spawn_camera(&mut commands); + spawn_help_text(&mut commands); +} + +/// Spawns the rotating sphere of small cubes. +fn spawn_small_cubes( + commands: &mut Commands, + meshes: &mut Assets, + materials: &mut Assets, +) { + // Add the cube mesh. + let small_cube = meshes.add(Cuboid::new( + SMALL_CUBE_SIZE, + SMALL_CUBE_SIZE, + SMALL_CUBE_SIZE, + )); + + // Add the cube material. + let small_cube_material = materials.add(StandardMaterial { + base_color: SILVER.into(), + ..default() + }); + + // Create the entity that the small cubes will be parented to. This is the + // entity that we rotate. + let sphere_parent = commands + .spawn(Transform::from_translation(Vec3::ZERO)) + .insert(Visibility::default()) + .insert(SphereParent) + .id(); + + // Now we have to figure out where to place the cubes. To do that, we create + // a sphere mesh, but we don't add it to the scene. Instead, we inspect the + // sphere mesh to find the positions of its vertices, and spawn a small cube + // at each one. That way, we end up with a bunch of cubes arranged in a + // spherical shape. + + // Create the sphere mesh, and extract the positions of its vertices. + let sphere = Sphere::new(OUTER_RADIUS) + .mesh() + .ico(OUTER_SUBDIVISION_COUNT) + .unwrap(); + let sphere_positions = sphere.attribute(Mesh::ATTRIBUTE_POSITION).unwrap(); + + // At each vertex, create a small cube. + for sphere_position in sphere_positions.as_float3().unwrap() { + let sphere_position = Vec3::from_slice(sphere_position); + let small_cube = commands + .spawn(Mesh3d(small_cube.clone())) + .insert(MeshMaterial3d(small_cube_material.clone())) + .insert(Transform::from_translation(sphere_position)) + .id(); + commands.entity(sphere_parent).add_child(small_cube); + } +} + +/// Spawns the large cube at the center of the screen. +/// +/// This cube rotates chaotically and occludes small cubes behind it. +fn spawn_large_cube( + commands: &mut Commands, + asset_server: &AssetServer, + meshes: &mut Assets, + materials: &mut Assets, +) { + commands + .spawn(Mesh3d(meshes.add(Cuboid::new( + LARGE_CUBE_SIZE, + LARGE_CUBE_SIZE, + LARGE_CUBE_SIZE, + )))) + .insert(MeshMaterial3d(materials.add(StandardMaterial { + base_color: WHITE.into(), + base_color_texture: Some(asset_server.load("branding/icon.png")), + ..default() + }))) + .insert(Transform::IDENTITY) + .insert(LargeCube); +} + +// Spins the outer sphere a bit every frame. +// +// This ensures that the set of cubes that are hidden and shown varies over +// time. +fn spin_small_cubes(mut sphere_parents: Query<&mut Transform, With>) { + for mut sphere_parent_transform in &mut sphere_parents { + sphere_parent_transform.rotate_y(ROTATION_SPEED); + } +} + +/// Spins the large cube a bit every frame. +/// +/// The chaotic rotation adds a bit of randomness to the scene to better +/// demonstrate the dynamicity of the occlusion culling. 
+fn spin_large_cube(mut large_cubes: Query<&mut Transform, With<LargeCube>>) {
+    for mut transform in &mut large_cubes {
+        transform.rotate(Quat::from_euler(
+            EulerRot::XYZ,
+            0.13 * ROTATION_SPEED,
+            0.29 * ROTATION_SPEED,
+            0.35 * ROTATION_SPEED,
+        ));
+    }
+}
+
+/// Spawns a directional light to illuminate the scene.
+fn spawn_light(commands: &mut Commands) {
+    commands
+        .spawn(DirectionalLight::default())
+        .insert(Transform::from_rotation(Quat::from_euler(
+            EulerRot::ZYX,
+            0.0,
+            PI * -0.15,
+            PI * -0.15,
+        )));
+}
+
+/// Spawns a camera that includes the depth prepass and occlusion culling.
+fn spawn_camera(commands: &mut Commands) {
+    commands
+        .spawn(Camera3d::default())
+        .insert(Transform::from_xyz(0.0, 0.0, 9.0).looking_at(Vec3::ZERO, Vec3::Y))
+        .insert(DepthPrepass)
+        .insert(OcclusionCulling);
+}
+
+/// Spawns the help text at the upper left of the screen.
+fn spawn_help_text(commands: &mut Commands) {
+    commands.spawn((
+        Text::new(""),
+        Node {
+            position_type: PositionType::Absolute,
+            top: Val::Px(12.0),
+            left: Val::Px(12.0),
+            ..default()
+        },
+    ));
+}
+
+impl render_graph::Node for ReadbackIndirectParametersNode {
+    fn run<'w>(
+        &self,
+        _: &mut RenderGraphContext,
+        render_context: &mut RenderContext<'w>,
+        world: &'w World,
+    ) -> Result<(), NodeRunError> {
+        // Extract the buffers that hold the GPU indirect draw parameters from
+        // the world resources. We're going to read those buffers to determine
+        // how many meshes were actually drawn.
+        let (Some(indirect_parameters_buffers), Some(indirect_parameters_mapping_buffers)) = (
+            world.get_resource::<IndirectParametersBuffers>(),
+            world.get_resource::<IndirectParametersStagingBuffers>(),
+        ) else {
+            return Ok(());
+        };
+
+        // Grab both the buffers we're copying from and the staging buffers
+        // we're copying to. Remember that we can't map the indirect parameters
+        // buffers directly, so we have to copy their contents to a staging
+        // buffer.
+        let (
+            Some(indexed_data_buffer),
+            Some(indexed_batch_sets_buffer),
+            Some(indirect_parameters_staging_data_buffer),
+            Some(indirect_parameters_staging_batch_sets_buffer),
+        ) = (
+            indirect_parameters_buffers.indexed_data_buffer(),
+            indirect_parameters_buffers.indexed_batch_sets_buffer(),
+            indirect_parameters_mapping_buffers.data.as_ref(),
+            indirect_parameters_mapping_buffers.batch_sets.as_ref(),
+        )
+        else {
+            return Ok(());
+        };
+
+        // Copy from the indirect parameters buffers to the staging buffers.
+        render_context.command_encoder().copy_buffer_to_buffer(
+            indexed_data_buffer,
+            0,
+            indirect_parameters_staging_data_buffer,
+            0,
+            indexed_data_buffer.size(),
+        );
+        render_context.command_encoder().copy_buffer_to_buffer(
+            indexed_batch_sets_buffer,
+            0,
+            indirect_parameters_staging_batch_sets_buffer,
+            0,
+            indexed_batch_sets_buffer.size(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Creates the staging buffers that we use to read back the indirect parameters
+/// from the GPU to the CPU.
+///
+/// We read the indirect parameters from the GPU to the CPU in order to display
+/// the number of meshes that were culled each frame.
+///
+/// We need these staging buffers because `wgpu` doesn't allow us to read the
+/// contents of the indirect parameters buffers directly. We must first copy
+/// them from the GPU to a staging buffer, and then read the staging buffer.
+fn create_indirect_parameters_staging_buffers(
+    mut indirect_parameters_staging_buffers: ResMut<IndirectParametersStagingBuffers>,
+    indirect_parameters_buffers: Res<IndirectParametersBuffers>,
+    render_device: Res<RenderDevice>,
+) {
+    // Fetch the indirect parameters buffers that we're going to copy from.
+    let (Some(indexed_data_buffer), Some(indexed_batch_set_buffer)) = (
+        indirect_parameters_buffers.indexed_data_buffer(),
+        indirect_parameters_buffers.indexed_batch_sets_buffer(),
+    ) else {
+        return;
+    };
+
+    // Build the staging buffers. Make sure they have the same sizes as the
+    // buffers we're copying from.
+    indirect_parameters_staging_buffers.data =
+        Some(render_device.create_buffer(&BufferDescriptor {
+            label: Some("indexed data staging buffer"),
+            size: indexed_data_buffer.size(),
+            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        }));
+    indirect_parameters_staging_buffers.batch_sets =
+        Some(render_device.create_buffer(&BufferDescriptor {
+            label: Some("indexed batch set staging buffer"),
+            size: indexed_batch_set_buffer.size(),
+            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        }));
+}
+
+/// Updates the app status text at the top of the screen.
+fn update_status_text(
+    saved_indirect_parameters: Res<SavedIndirectParameters>,
+    mut texts: Query<&mut Text>,
+    meshes: Query<(), With<Mesh3d>>,
+    app_status: Res<AppStatus>,
+) {
+    // How many meshes are in the scene?
+    let total_mesh_count = meshes.iter().count();
+
+    // Sample the rendered object count. Note that we don't synchronize beyond
+    // locking the data, so this value will generally be at least one frame
+    // behind. This is fine; this app is just a demonstration after all.
+    let rendered_object_count: u32 = {
+        let saved_indirect_parameters = saved_indirect_parameters.lock().unwrap();
+        saved_indirect_parameters
+            .data
+            .iter()
+            .take(saved_indirect_parameters.count as usize)
+            .map(|indirect_parameters| indirect_parameters.instance_count)
+            .sum()
+    };
+
+    // Change the text.
+    for mut text in &mut texts {
+        text.0 = format!(
+            "Occlusion culling {} (Press Space to toggle)\n{}/{} meshes rendered",
+            if app_status.occlusion_culling {
+                "ON"
+            } else {
+                "OFF"
+            },
+            rendered_object_count,
+            total_mesh_count
+        );
+    }
+}
+
+fn readback_indirect_parameters(
+    mut indirect_parameters_mapping_buffers: ResMut<IndirectParametersStagingBuffers>,
+    saved_indirect_parameters: Res<SavedIndirectParameters>,
+) {
+    let (Some(data_buffer), Some(batch_sets_buffer)) = (
+        indirect_parameters_mapping_buffers.data.take(),
+        indirect_parameters_mapping_buffers.batch_sets.take(),
+    ) else {
+        return;
+    };
+
+    let saved_indirect_parameters_0 = (**saved_indirect_parameters).clone();
+    let saved_indirect_parameters_1 = (**saved_indirect_parameters).clone();
+    readback_buffer::<IndirectParametersIndexed>(data_buffer, move |indirect_parameters| {
+        saved_indirect_parameters_0.lock().unwrap().data = indirect_parameters.to_vec();
+    });
+    readback_buffer::<u32>(batch_sets_buffer, move |indirect_parameters_count| {
+        saved_indirect_parameters_1.lock().unwrap().count = indirect_parameters_count[0];
+    });
+}
+
+// A helper function to asynchronously read an array of [`Pod`] values back from
+// the GPU to the CPU.
+//
+// The given callback is invoked when the data is ready. The buffer will
+// automatically be unmapped after the callback executes.
+fn readback_buffer<T>(buffer: Buffer, callback: impl FnOnce(&[T]) + Send + 'static)
+where
+    T: Pod,
+{
+    // We need to make another reference to the buffer so that we can move the
+    // original reference into the closure below.
+    let original_buffer = buffer.clone();
+    original_buffer
+        .slice(..)
+        .map_async(MapMode::Read, move |result| {
+            // Make sure we succeeded.
+            if result.is_err() {
+                return;
+            }
+
+            {
+                // Cast the raw bytes in the GPU buffer to the appropriate type.
+                let buffer_view = buffer.slice(..).get_mapped_range();
+                let indirect_parameters: &[T] = bytemuck::cast_slice(
+                    &buffer_view[0..(buffer_view.len() / size_of::<T>() * size_of::<T>())],
+                );
+
+                // Invoke the callback.
+                callback(indirect_parameters);
+            }
+
+            // Unmap the buffer. We have to do this before submitting any more
+            // GPU command buffers, or `wgpu` will assert.
+            buffer.unmap();
+        });
+}
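One line in `readback_buffer` deserves a note: the mapped range may be larger than a whole number of `T`s (buffer sizes are aligned), and `bytemuck::cast_slice` panics on a size mismatch, so the view is truncated to a whole number of elements first. The arithmetic, in isolation:

```rust
use std::mem::size_of;

// Integer division rounds down, so `len / size * size` is the largest
// multiple of `size` not exceeding `len`.
fn whole_elements_len(len_bytes: usize, elem_size: usize) -> usize {
    len_bytes / elem_size * elem_size
}

fn main() {
    // A 10-byte mapped range holds two complete `u32`s (8 bytes); feeding the
    // 2 leftover bytes to `bytemuck::cast_slice` would panic.
    assert_eq!(whole_elements_len(10, size_of::<u32>()), 8);
}
```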
+
+/// Adds or removes the [`OcclusionCulling`] and [`DepthPrepass`] components
+/// when the user presses the spacebar.
+fn toggle_occlusion_culling_on_request(
+    mut commands: Commands,
+    input: Res<ButtonInput<KeyCode>>,
+    mut app_status: ResMut<AppStatus>,
+    cameras: Query<Entity, With<Camera>>,
+) {
+    // Only run when the user presses the spacebar.
+    if !input.just_pressed(KeyCode::Space) {
+        return;
+    }
+
+    // Toggle the occlusion culling flag in `AppStatus`.
+    app_status.occlusion_culling = !app_status.occlusion_culling;
+
+    // Add or remove the `OcclusionCulling` and `DepthPrepass` components as
+    // requested.
+    for camera in &cameras {
+        if app_status.occlusion_culling {
+            commands
+                .entity(camera)
+                .insert(DepthPrepass)
+                .insert(OcclusionCulling);
+        } else {
+            commands
+                .entity(camera)
+                .remove::<DepthPrepass>()
+                .remove::<OcclusionCulling>();
+        }
+    }
+}
diff --git a/examples/README.md b/examples/README.md
index c7a5d6b86dbd0a..7e41f784b9b41d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -161,6 +161,7 @@ Example | Description
 [Meshlet](../examples/3d/meshlet.rs) | Meshlet rendering for dense high-poly scenes (experimental)
 [Mixed lighting](../examples/3d/mixed_lighting.rs) | Demonstrates how to combine baked and dynamic lighting
 [Motion Blur](../examples/3d/motion_blur.rs) | Demonstrates per-pixel motion blur
+[Occlusion Culling](../examples/3d/occlusion_culling.rs) | Demonstrates GPU occlusion culling
 [Order Independent Transparency](../examples/3d/order_independent_transparency.rs) | Demonstrates how to use OIT
 [Orthographic View](../examples/3d/orthographic.rs) | Shows how to create a 3D orthographic view (for isometric-look in games or CAD applications)
 [Parallax Mapping](../examples/3d/parallax_mapping.rs) | Demonstrates use of a normal map and depth map for parallax mapping
diff --git a/examples/shader/specialized_mesh_pipeline.rs b/examples/shader/specialized_mesh_pipeline.rs
index bc1ecf113c78b0..9cc414e26a3921 100644
--- a/examples/shader/specialized_mesh_pipeline.rs
+++ b/examples/shader/specialized_mesh_pipeline.rs
@@ -6,8 +6,6 @@
 //!
 //! [`SpecializedMeshPipeline`] lets you customize the entire pipeline used when rendering a mesh.
-use std::any::TypeId; - use bevy::{ core_pipeline::core_3d::{Opaque3d, Opaque3dBatchSetKey, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT}, ecs::system::StaticSystemParam, @@ -21,11 +19,11 @@ use bevy::{ batching::GetFullBatchData, batching::{ gpu_preprocessing::{ - BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem, - PreprocessWorkItemBuffers, + self, BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem, }, GetBatchData, }, + experimental::occlusion_culling::OcclusionCulling, extract_component::{ExtractComponent, ExtractComponentPlugin}, mesh::{Indices, MeshVertexBufferLayoutRef, PrimitiveTopology, RenderMesh}, render_asset::{RenderAssetUsages, RenderAssets}, @@ -43,7 +41,6 @@ use bevy::{ view::{self, ExtractedView, RenderVisibleEntities, ViewTarget, VisibilityClass}, Render, RenderApp, RenderSet, }, - utils::TypeIdMap, }; const SHADER_ASSET_PATH: &str = "shaders/specialized_mesh_pipeline.wgsl"; @@ -288,6 +285,7 @@ fn queue_custom_mesh_pipeline( &ExtractedView, &Msaa, Has, + Has, )>, (render_meshes, render_mesh_instances): ( Res>, @@ -307,6 +305,8 @@ fn queue_custom_mesh_pipeline( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. } = gpu_array_buffer.into_inner(); @@ -318,7 +318,15 @@ fn queue_custom_mesh_pipeline( // Render phases are per-view, so we need to iterate over all views so that // the entity appears in them. (In this example, we have only one view, but // it's good practice to loop over all views anyway.) - for (view_entity, view_visible_entities, view, msaa, no_indirect_drawing) in views.iter() { + for ( + view_entity, + view_visible_entities, + view, + msaa, + no_indirect_drawing, + gpu_occlusion_culling, + ) in views.iter() + { let Some(opaque_phase) = opaque_render_phases.get_mut(&view.retained_view_entity) else { continue; }; @@ -326,11 +334,14 @@ fn queue_custom_mesh_pipeline( // Create a *work item buffer* if necessary. Work item buffers store the // indices of meshes that are to be rendered when indirect drawing is // enabled. - let work_item_buffer = work_item_buffers - .entry(view_entity) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = gpu_preprocessing::get_or_create_work_item_buffer::( + work_item_buffers, + view_entity, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Create the key based on the view. In this case we only care about MSAA and HDR let view_key = MeshPipelineKey::from_msaa_samples(msaa.samples()) diff --git a/examples/tools/scene_viewer/main.rs b/examples/tools/scene_viewer/main.rs index af8272a13bec01..ae78dbe077e985 100644 --- a/examples/tools/scene_viewer/main.rs +++ b/examples/tools/scene_viewer/main.rs @@ -4,11 +4,15 @@ //! replacing the path as appropriate. //! In case of multiple scenes, you can select which to display by adapting the file path: `/path/to/model.gltf#Scene1`. //! With no arguments it will load the `FlightHelmet` glTF model from the repository assets subdirectory. +//! Pass `--help` to see all the supported arguments. //! //! If you want to hot reload asset changes, enable the `file_watcher` cargo feature. 
+use argh::FromArgs; use bevy::{ + core_pipeline::prepass::DepthPrepass, prelude::*, + render::experimental::occlusion_culling::OcclusionCulling, render::primitives::{Aabb, Sphere}, }; @@ -24,7 +28,26 @@ use camera_controller::{CameraController, CameraControllerPlugin}; use morph_viewer_plugin::MorphViewerPlugin; use scene_viewer_plugin::{SceneHandle, SceneViewerPlugin}; +/// A simple glTF scene viewer made with Bevy +#[derive(FromArgs, Resource)] +struct Args { + /// the path to the glTF scene + #[argh( + positional, + default = "\"assets/models/FlightHelmet/FlightHelmet.gltf\".to_string()" + )] + scene_path: String, + /// enable occlusion culling + #[argh(switch)] + occlusion_culling: Option, +} + fn main() { + #[cfg(not(target_arch = "wasm32"))] + let args: Args = argh::from_env(); + #[cfg(target_arch = "wasm32")] + let args: Args = Args::from_args(&[], &[]).unwrap(); + let mut app = App::new(); app.add_plugins(( DefaultPlugins @@ -43,6 +66,7 @@ fn main() { SceneViewerPlugin, MorphViewerPlugin, )) + .insert_resource(args) .add_systems(Startup, setup) .add_systems(PreUpdate, setup_scene_after_load); @@ -67,12 +91,10 @@ fn parse_scene(scene_path: String) -> (String, usize) { (scene_path, 0) } -fn setup(mut commands: Commands, asset_server: Res) { - let scene_path = std::env::args() - .nth(1) - .unwrap_or_else(|| "assets/models/FlightHelmet/FlightHelmet.gltf".to_string()); +fn setup(mut commands: Commands, asset_server: Res, args: Res) { + let scene_path = &args.scene_path; info!("Loading {}", scene_path); - let (file_path, scene_index) = parse_scene(scene_path); + let (file_path, scene_index) = parse_scene((*scene_path).clone()); commands.insert_resource(SceneHandle::new(asset_server.load(file_path), scene_index)); } @@ -82,6 +104,7 @@ fn setup_scene_after_load( mut setup: Local, mut scene_handle: ResMut, asset_server: Res, + args: Res, meshes: Query<(&GlobalTransform, Option<&Aabb>), With>, ) { if scene_handle.is_loaded && !*setup { @@ -125,7 +148,7 @@ fn setup_scene_after_load( info!("{}", camera_controller); info!("{}", *scene_handle); - commands.spawn(( + let mut camera = commands.spawn(( Camera3d::default(), Projection::from(projection), Transform::from_translation(Vec3::from(aabb.center) + size * Vec3::new(0.5, 0.25, 0.5)) @@ -145,6 +168,12 @@ fn setup_scene_after_load( camera_controller, )); + // If occlusion culling was requested, include the relevant components. + // The Z-prepass is currently required. + if args.occlusion_culling == Some(true) { + camera.insert((DepthPrepass, OcclusionCulling)); + } + // Spawn a default light if the scene does not have one if !scene_handle.has_light { info!("Spawning a directional light");
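With the `argh` switch defined above, occlusion culling in the scene viewer becomes opt-in from the command line. Assuming argh's usual mapping of underscored field names to kebab-case flags, an invocation should look like `cargo run --example scene_viewer -- --occlusion-culling assets/models/FlightHelmet/FlightHelmet.gltf`, which adds the `DepthPrepass` and `OcclusionCulling` components to the viewer camera as shown in `setup_scene_after_load`.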