From 31ed43c340ee006eba508fcfd3c8eb746970c88c Mon Sep 17 00:00:00 2001 From: Shane Peelar Date: Sun, 6 Nov 2022 16:29:20 -0500 Subject: [PATCH] Implement batched query support * Only Dense queries are accelerated currently * Certain features are awaiting on GATs/generic const expressions * Code refactored to use GATs * `elain` crate now used to provide const generic alignments (PhantomData method) * Code still requires a fixed alignment request across whole query (in progress) * Batches support AsRef, AsMut, etc * Simplified calling for_each_{mut_}batched (no longer need _ arguments) * Add convenience functions for treating AlignedBatch16 as Vec4s * Add a map API for creating projections of aligned batches * Add a compile-time error in case a greater alignment is needed than the current batch alignment to satisfy T * Documentation about SIMD and batching * ALIGN now referred to as MIN_ALIGN for clarity --- crates/bevy_ecs/Cargo.toml | 1 + crates/bevy_ecs/src/archetype.rs | 6 +- crates/bevy_ecs/src/change_detection.rs | 145 ++++++++- crates/bevy_ecs/src/query/fetch.rs | 285 +++++++++++++++- crates/bevy_ecs/src/query/filter.rs | 4 +- crates/bevy_ecs/src/query/iter.rs | 9 +- crates/bevy_ecs/src/query/mod.rs | 112 +++++++ crates/bevy_ecs/src/query/state.rs | 215 +++++++++++- crates/bevy_ecs/src/storage/aligned_vec.rs | 258 +++++++++++++++ crates/bevy_ecs/src/storage/blob_vec.rs | 32 +- crates/bevy_ecs/src/storage/mod.rs | 1 + crates/bevy_ecs/src/storage/sparse_set.rs | 15 +- crates/bevy_ecs/src/storage/table.rs | 36 ++- crates/bevy_ecs/src/system/query.rs | 254 ++++++++++++++- crates/bevy_ecs/src/world/world_cell.rs | 2 +- crates/bevy_ptr/Cargo.toml | 3 + crates/bevy_ptr/src/batch.rs | 360 +++++++++++++++++++++ crates/bevy_ptr/src/lib.rs | 83 ++++- 18 files changed, 1757 insertions(+), 64 deletions(-) create mode 100644 crates/bevy_ecs/src/storage/aligned_vec.rs create mode 100644 crates/bevy_ptr/src/batch.rs diff --git a/crates/bevy_ecs/Cargo.toml b/crates/bevy_ecs/Cargo.toml index 29769f6e45e628..b0a1f132c64ed0 100644 --- a/crates/bevy_ecs/Cargo.toml +++ b/crates/bevy_ecs/Cargo.toml @@ -30,6 +30,7 @@ serde = { version = "1", features = ["derive"] } [dev-dependencies] rand = "0.8" +bevy_math = { path = "../bevy_math", version = "0.9.0-dev" } [[example]] name = "events" diff --git a/crates/bevy_ecs/src/archetype.rs b/crates/bevy_ecs/src/archetype.rs index 0fc9271a42c6cf..c6eeb39bdccf33 100644 --- a/crates/bevy_ecs/src/archetype.rs +++ b/crates/bevy_ecs/src/archetype.rs @@ -5,7 +5,7 @@ use crate::{ bundle::BundleId, component::{ComponentId, StorageType}, entity::{Entity, EntityLocation}, - storage::{SparseArray, SparseSet, SparseSetIndex, TableId}, + storage::{aligned_vec::SimdAlignedVec, SparseArray, SparseSet, SparseSetIndex, TableId}, }; use std::{ collections::HashMap, @@ -181,7 +181,7 @@ pub struct Archetype { id: ArchetypeId, table_id: TableId, edges: Edges, - entities: Vec, + entities: SimdAlignedVec, table_components: Box<[ComponentId]>, sparse_set_components: Box<[ComponentId]>, components: SparseSet, @@ -225,7 +225,7 @@ impl Archetype { Self { id, table_id, - entities: Vec::new(), + entities: SimdAlignedVec::new(), components, table_components, sparse_set_components, diff --git a/crates/bevy_ecs/src/change_detection.rs b/crates/bevy_ecs/src/change_detection.rs index b8d1f7c196d352..61114cf974b43c 100644 --- a/crates/bevy_ecs/src/change_detection.rs +++ b/crates/bevy_ecs/src/change_detection.rs @@ -1,7 +1,15 @@ //! Types that detect when their internal data mutate. 
+use crate::ptr::{ + batch::AlignedBatch, + elain::{Align, Alignment}, +}; + use crate::{component::ComponentTicks, ptr::PtrMut, system::Resource}; -use std::ops::{Deref, DerefMut}; +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; /// The (arbitrarily chosen) minimum number of world tick increments between `check_tick` scans. /// @@ -229,6 +237,15 @@ pub(crate) struct Ticks<'a> { pub(crate) change_tick: u32, } +pub(crate) struct TicksBatch<'a, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) component_ticks: &'a mut AlignedBatch, + pub(crate) last_change_tick: u32, + pub(crate) change_tick: u32, +} + /// Unique mutable borrow of a [`Resource`]. /// /// See the [`Resource`] documentation for usage. @@ -352,6 +369,132 @@ change_detection_impl!(Mut<'a, T>, T,); impl_methods!(Mut<'a, T>, T,); impl_debug!(Mut<'a, T>,); +/// Unique mutable borrow of an entity's component (batched version). +/// Each batch changes in unison: a batch has changed if any of its elements have changed. +pub struct MutBatch<'a, T, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) value: &'a mut AlignedBatch, + pub(crate) ticks: TicksBatch<'a, N, MIN_ALIGN>, + pub(crate) _marker: PhantomData, +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> DetectChanges for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn is_added(&self) -> bool { + self.ticks + .component_ticks + .as_array() + .iter() + .any(|x| x.is_added(self.ticks.last_change_tick, self.ticks.change_tick)) + } + + #[inline] + fn is_changed(&self) -> bool { + self.ticks + .component_ticks + .as_array() + .iter() + .any(|x| x.is_changed(self.ticks.last_change_tick, self.ticks.change_tick)) + } + + #[inline] + fn set_changed(&mut self) { + for ticks in self.ticks.component_ticks.as_array_mut().iter_mut() { + ticks.set_changed(self.ticks.change_tick); + } + } + + #[inline] + fn last_changed(&self) -> u32 { + self.ticks.last_change_tick + } + + type Inner = AlignedBatch; + + fn set_last_changed(&mut self, last_change_tick: u32) { + self.ticks.last_change_tick = last_change_tick; + } + + fn bypass_change_detection(&mut self) -> &mut Self::Inner { + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> Deref for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + type Target = AlignedBatch; + + #[inline] + fn deref(&self) -> &Self::Target { + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> DerefMut for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + self.set_changed(); + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> AsRef> + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn as_ref(&self) -> &AlignedBatch { + self.deref() + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> AsMut> + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn as_mut(&mut self) -> &mut AlignedBatch { + self.deref_mut() + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + /// Consume `self` and return a mutable reference to the + /// contained value while marking `self` as "changed". 
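+    ///
+    /// As with [`Mut::into_inner`], this is mainly useful when the returned reference needs to
+    /// outlive the `MutBatch` wrapper itself.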
+ #[inline] + pub fn into_inner(mut self) -> &'a mut AlignedBatch { + self.set_changed(); + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> std::fmt::Debug + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, + AlignedBatch: std::fmt::Debug, + T: std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.value).finish() + } +} + /// Unique mutable borrow of resources or an entity's component. /// /// Similar to [`Mut`], but not generic over the component type, instead diff --git a/crates/bevy_ecs/src/query/fetch.rs b/crates/bevy_ecs/src/query/fetch.rs index ad6598bcc6e0d6..81c93cd47acf9d 100644 --- a/crates/bevy_ecs/src/query/fetch.rs +++ b/crates/bevy_ecs/src/query/fetch.rs @@ -1,15 +1,20 @@ use crate::{ archetype::{Archetype, ArchetypeComponentId}, - change_detection::Ticks, + change_detection::{MutBatch, Ticks, TicksBatch}, component::{Component, ComponentId, ComponentStorage, ComponentTicks, StorageType}, entity::Entity, + ptr::{ + batch::AlignedBatch, + elain::{Align, Alignment}, + ThinSimdAlignedSlicePtr, UnsafeCellDeref, + }, query::{Access, DebugCheckedUnwrap, FilteredAccess}, storage::{ComponentSparseSet, Table}, world::{Mut, World}, }; + use bevy_ecs_macros::all_tuples; pub use bevy_ecs_macros::WorldQuery; -use bevy_ptr::{ThinSlicePtr, UnsafeCellDeref}; use std::{cell::UnsafeCell, marker::PhantomData}; /// Types that can be fetched from a [`World`] using a [`Query`]. @@ -426,6 +431,30 @@ pub unsafe trait WorldQuery { ) -> bool; } +/// An extension of [`WorldQuery`] for batched queries. +pub trait WorldQueryBatch: WorldQuery { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> + where + Align: Alignment; + + /// Retrieve a batch of size `N` with desired alignment `ALIGN` from the current table. + /// # Safety + /// + /// `table_row_start` is a valid table row index for the current table + /// `table_row_start` + `N` is a valid table row index for the current table + /// `table_row_start` is a multiple of `N` + /// + /// Must always be called _after_ [`WorldQuery::set_table`]. + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> Self::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment; +} + /// A world query that is read only. /// /// # Safety @@ -437,11 +466,17 @@ pub unsafe trait ReadOnlyWorldQuery: WorldQuery {} pub type QueryFetch<'w, Q> = ::Fetch<'w>; /// The item type returned when a [`WorldQuery`] is iterated over pub type QueryItem<'w, Q> = ::Item<'w>; +/// The item type returned when a [`WorldQuery`] is iterated over in a batched fashion +pub type QueryBatch<'w, Q, const N: usize, const MIN_ALIGN: usize> = + ::FullBatch<'w, N, MIN_ALIGN>; /// The read-only `Fetch` of a [`WorldQuery`], which is used to store state for each archetype/table. 
pub type ROQueryFetch<'w, Q> = QueryFetch<'w, ::ReadOnly>; /// The read-only variant of the item type returned when a [`WorldQuery`] is iterated over immutably pub type ROQueryItem<'w, Q> = QueryItem<'w, ::ReadOnly>; +pub type ROQueryBatch<'w, Q, const N: usize, const MIN_ALIGN: usize> = + QueryBatch<'w, ::ReadOnly, N, MIN_ALIGN>; + /// SAFETY: no component or archetype access unsafe impl WorldQuery for Entity { type Fetch<'w> = (); @@ -508,13 +543,32 @@ unsafe impl WorldQuery for Entity { } } +impl WorldQueryBatch for Entity { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = &'w AlignedBatch + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + entity_batch + } +} + /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for Entity {} #[doc(hidden)] pub struct ReadFetch<'w, T> { // T::Storage = TableStorage - table_components: Option>>, + table_components: Option>>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, } @@ -587,7 +641,6 @@ unsafe impl WorldQuery for &T { .get_column(component_id) .debug_checked_unwrap() .get_data_slice() - .into(), ); } @@ -649,16 +702,39 @@ unsafe impl WorldQuery for &T { /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for &T {} +impl WorldQueryBatch for &T { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = &'w AlignedBatch + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + //TODO: when generalized const expresions are stable, want the following: + //gcd::euclid_usize(ptr::MAX_SIMD_ALIGNMENT, N * core::mem::size_of::()); + + let components = fetch.table_components.debug_checked_unwrap(); + + components.get_batch_aligned_deref::(table_row_start, len) + } +} + #[doc(hidden)] pub struct WriteFetch<'w, T> { // T::Storage = TableStorage table_data: Option<( - ThinSlicePtr<'w, UnsafeCell>, - ThinSlicePtr<'w, UnsafeCell>, + ThinSimdAlignedSlicePtr<'w, UnsafeCell>, + ThinSimdAlignedSlicePtr<'w, UnsafeCell>, )>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, - last_change_tick: u32, change_tick: u32, } @@ -732,8 +808,8 @@ unsafe impl<'__w, T: Component> WorldQuery for &'__w mut T { ) { let column = table.get_column(component_id).debug_checked_unwrap(); fetch.table_data = Some(( - column.get_data_slice().into(), - column.get_ticks_slice().into(), + column.get_data_slice(), + column.get_ticks_slice(), )); } @@ -807,6 +883,44 @@ unsafe impl<'__w, T: Component> WorldQuery for &'__w mut T { } } +impl<'__w, T: Component> WorldQueryBatch for &'__w mut T { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = MutBatch<'w, T, N, MIN_ALIGN> + where + Align: Alignment + ; + + //FIXME: + /* + T: AlignedBatchGat, + ComponentTicks: AlignedBatchGat, + */ + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + let (table_components, table_ticks) = fetch.table_data.debug_checked_unwrap(); + + MutBatch:: { + value: 
table_components.get_batch_aligned_deref_mut::(table_row_start, len), + ticks: TicksBatch { + // SAFETY: [table_row_start..+batch.len()] is in range + component_ticks: table_ticks + .get_batch_aligned_deref_mut::(table_row_start, len), + change_tick: fetch.change_tick, + last_change_tick: fetch.last_change_tick, + }, + _marker: PhantomData, + } + } +} + #[doc(hidden)] pub struct OptionFetch<'w, T: WorldQuery> { fetch: T::Fetch<'w>, @@ -911,6 +1025,34 @@ unsafe impl WorldQuery for Option { } } +impl WorldQueryBatch for Option { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = Option> + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + if fetch.matches { + Some(T::fetch_batched( + &mut fetch.fetch, + entity_batch, + table_row_start, + len, + )) + } else { + None + } + } +} + /// SAFETY: [`OptionFetch`] is read only because `T` is read only unsafe impl ReadOnlyWorldQuery for Option {} @@ -989,10 +1131,46 @@ impl ChangeTrackers { } } +/// A batch of [`ChangeTrackers`]. This is used when performing queries with Change Trackers using the +/// [`Query::for_each_mut_batched`](crate::system::Query::for_each_mut_batched) and [`Query::for_each_batched`](crate::system::Query::for_each_batched) functions. +#[derive(Clone)] +pub struct ChangeTrackersBatch<'a, T, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) component_ticks: &'a AlignedBatch, + pub(crate) last_change_tick: u32, + pub(crate) change_tick: u32, + marker: PhantomData, +} + +impl<'a, T: Component, const N: usize, const MIN_ALIGN: usize> ChangeTrackersBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + /// Returns true if this component has been added since the last execution of this system. + #[inline] + pub fn is_added(&self) -> bool { + self.component_ticks + .as_array() + .iter() + .any(|x| x.is_added(self.last_change_tick, self.change_tick)) + } + + /// Returns true if this component has been changed since the last execution of this system. 
+ #[inline] + pub fn is_changed(&self) -> bool { + self.component_ticks + .as_array() + .iter() + .any(|x| x.is_changed(self.last_change_tick, self.change_tick)) + } +} + #[doc(hidden)] pub struct ChangeTrackersFetch<'w, T> { // T::Storage = TableStorage - table_ticks: Option>>, + table_ticks: Option>>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, @@ -1075,7 +1253,6 @@ unsafe impl WorldQuery for ChangeTrackers { .get_column(id) .debug_checked_unwrap() .get_ticks_slice() - .into(), ); } @@ -1140,6 +1317,35 @@ unsafe impl WorldQuery for ChangeTrackers { } } +impl WorldQueryBatch for ChangeTrackers { + //FIXME: ComponentTicks: AlignedBatchGat, + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ChangeTrackersBatch<'w, T, N, MIN_ALIGN> + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> Self::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + ChangeTrackersBatch { + component_ticks: { + let table_ticks = fetch.table_ticks.debug_checked_unwrap(); + + table_ticks.get_batch_aligned_deref::(table_row_start, len) + }, + marker: PhantomData, + last_change_tick: fetch.last_change_tick, + change_tick: fetch.change_tick, + } + } +} + /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for ChangeTrackers {} @@ -1239,9 +1445,34 @@ macro_rules! impl_tuple_fetch { } } + //FIXME: the magic can happen here with different ALIGNments!!! + #[allow(unused_variables)] + #[allow(non_snake_case)] + #[allow(clippy::unused_unit)] + impl< $($name: WorldQueryBatch),*> WorldQueryBatch for ($($name,)*) + { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ($($name::FullBatch<'w, N, MIN_ALIGN>,)*) + where + Align: Alignment + ; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment + { + let ($($name,)*) = _fetch; + ($($name::fetch_batched($name, _entity_batch, _table_row_start, _len),)*) + } + } + /// SAFETY: each item in the tuple is read only unsafe impl<$($name: ReadOnlyWorldQuery),*> ReadOnlyWorldQuery for ($($name,)*) {} - }; } @@ -1384,6 +1615,36 @@ macro_rules! 
impl_anytuple_fetch { /// SAFETY: each item in the tuple is read only unsafe impl<$($name: ReadOnlyWorldQuery),*> ReadOnlyWorldQuery for AnyOf<($($name,)*)> {} + //FIXME: magic can happen here, too, for different ALIGNments + #[allow(unused_variables)] + #[allow(non_snake_case)] + #[allow(clippy::unused_unit)] + impl<$($name: WorldQueryBatch),*> WorldQueryBatch for AnyOf<($($name,)*)> + { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ($(Option<$name::FullBatch<'w, N, MIN_ALIGN>>,)*) + where + Align: Alignment; + + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment + { + let ($($name,)*) = _fetch; + + ($( + $name.1.then(|| $name::fetch_batched(&mut $name.0, _entity_batch, _table_row_start, _len)), + )*) + + } + } + }; } diff --git a/crates/bevy_ecs/src/query/filter.rs b/crates/bevy_ecs/src/query/filter.rs index 2cf2fc3e1c10ba..bc1d738cd46359 100644 --- a/crates/bevy_ecs/src/query/filter.rs +++ b/crates/bevy_ecs/src/query/filter.rs @@ -2,12 +2,12 @@ use crate::{ archetype::{Archetype, ArchetypeComponentId}, component::{Component, ComponentId, ComponentStorage, ComponentTicks, StorageType}, entity::Entity, + ptr::{ThinSimdAlignedSlicePtr, UnsafeCellDeref}, query::{Access, DebugCheckedUnwrap, FilteredAccess, WorldQuery}, storage::{ComponentSparseSet, Table}, world::World, }; use bevy_ecs_macros::all_tuples; -use bevy_ptr::{ThinSlicePtr, UnsafeCellDeref}; use std::{cell::UnsafeCell, marker::PhantomData}; use super::ReadOnlyWorldQuery; @@ -413,7 +413,7 @@ macro_rules! impl_tick_filter { #[doc(hidden)] $(#[$fetch_meta])* pub struct $fetch_name<'w, T> { - table_ticks: Option>>, + table_ticks: Option>>, marker: PhantomData, sparse_set: Option<&'w ComponentSparseSet>, last_change_tick: u32, diff --git a/crates/bevy_ecs/src/query/iter.rs b/crates/bevy_ecs/src/query/iter.rs index 8924c54829a2d5..115a59ced88bba 100644 --- a/crates/bevy_ecs/src/query/iter.rs +++ b/crates/bevy_ecs/src/query/iter.rs @@ -2,6 +2,7 @@ use crate::{ archetype::{ArchetypeEntity, ArchetypeId, Archetypes}, entity::{Entities, Entity}, prelude::World, + ptr::ThinSimdAlignedSlicePtr, query::{ArchetypeFilter, DebugCheckedUnwrap, QueryState, WorldQuery}, storage::{TableId, Tables}, }; @@ -469,7 +470,7 @@ impl<'w, 's, Q: ReadOnlyWorldQuery, F: ReadOnlyWorldQuery, const K: usize> Fused struct QueryIterationCursor<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> { table_id_iter: std::slice::Iter<'s, TableId>, archetype_id_iter: std::slice::Iter<'s, ArchetypeId>, - table_entities: &'w [Entity], + table_entities: ThinSimdAlignedSlicePtr<'w, Entity>, archetype_entities: &'w [ArchetypeEntity], fetch: Q::Fetch<'w>, filter: F::Fetch<'w>, @@ -540,7 +541,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, QueryIterationCursor { fetch, filter, - table_entities: &[], + table_entities: ThinSimdAlignedSlicePtr::dangling(), archetype_entities: &[], table_id_iter: query_state.matched_table_ids.iter(), archetype_id_iter: query_state.matched_archetype_ids.iter(), @@ -556,7 +557,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, if self.current_index > 0 { let index = self.current_index - 1; if Self::IS_DENSE { - let entity = self.table_entities.get_unchecked(index); + let entity = self.table_entities.get(index); Some(Q::fetch(&mut self.fetch, *entity, index)) } else { let 
archetype_entity = self.archetype_entities.get_unchecked(index); @@ -602,7 +603,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, // SAFETY: set_table was called prior. // `current_index` is a table row in range of the current table, because if it was not, then the if above would have been executed. - let entity = self.table_entities.get_unchecked(self.current_index); + let entity = self.table_entities.get(self.current_index); if !F::filter_fetch(&mut self.filter, *entity, self.current_index) { self.current_index += 1; continue; diff --git a/crates/bevy_ecs/src/query/mod.rs b/crates/bevy_ecs/src/query/mod.rs index ac0767f4c08c3e..1f7f6a546d9d44 100644 --- a/crates/bevy_ecs/src/query/mod.rs +++ b/crates/bevy_ecs/src/query/mod.rs @@ -75,6 +75,9 @@ mod tests { #[derive(Component, Debug, Eq, PartialEq, Clone, Copy)] struct D(usize); + #[derive(Component)] + struct E; + #[derive(Component, Debug, Eq, PartialEq, Clone, Copy)] #[component(storage = "SparseSet")] struct Sparse(usize); @@ -783,6 +786,115 @@ mod tests { } } + #[test] + fn batched_queries() { + let mut world = World::new(); + + world.spawn_batch( + (0..127) + .into_iter() + .map(|i| (A(4 * i), B(4 * i + 1), C(4 * i + 2), D(4 * i + 3))), + ); + + fn system_compute(mut q: Query<(&mut A, &B, &C, &D)>) { + let mut scalar_counter = 0; + let mut batch_counter = 0; + + q.for_each_mut_batched::<4, 16>( + |(mut a, b, c, d)| { + assert_eq!(a.ticks.component_ticks.added, 1); + assert_eq!(a.ticks.component_ticks.changed, 1); + + a.0 += b.0 + c.0 + d.0; + scalar_counter += 1; + + assert_eq!(a.ticks.component_ticks.added, 1); + assert_eq!(a.ticks.component_ticks.changed, 2); + }, + |(mut a, b, c, d)| { + for ticks in a.ticks.component_ticks.as_array().iter() { + assert_eq!(ticks.added, 1); + assert_eq!(ticks.changed, 1); + } + + assert_eq!( + *a.as_array(), + [ + A(4 * batch_counter), + A(4 * (batch_counter + 1)), + A(4 * (batch_counter + 2)), + A(4 * (batch_counter + 3)) + ] + ); + + for (i, mut a_elem) in a.as_array_mut().iter_mut().enumerate() { + a_elem.0 += b.as_array()[i].0 + c.as_array()[i].0 + d.as_array()[i].0; + } + + for ticks in a.ticks.component_ticks.as_array().iter() { + assert_eq!(ticks.added, 1); + assert_eq!(ticks.changed, 2); + } + + batch_counter += 4; + }, + ); + + assert_eq!(scalar_counter, 3); + assert_eq!(batch_counter, 124); + } + fn system_check(mut q: Query<&A>) { + let mut scalar_counter = 0; + let mut batch_counter = 0; + + q.for_each_batched::<4, 16>( + |a| { + assert_eq!(*a, A(1990 + 16 * scalar_counter)); + + scalar_counter += 1; + }, + |a| { + assert_eq!( + *a.as_array(), + [ + A(16 * batch_counter + 6), + A(16 * (batch_counter + 1) + 6), + A(16 * (batch_counter + 2) + 6), + A(16 * (batch_counter + 3) + 6) + ] + ); + + batch_counter += 4; + }, + ); + } + + world.increment_change_tick(); + + let mut system_compute = IntoSystem::into_system(system_compute); + system_compute.initialize(&mut world); + system_compute.run((), &mut world); + + let mut system_check = IntoSystem::into_system(system_check); + system_check.initialize(&mut world); + system_check.run((), &mut world); + } + + #[test] + fn batched_queries_zst() { + let mut world = World::new(); + + world.spawn_batch((0..127).into_iter().map(|_| E)); + + fn system_compute(mut q: Query<&mut E>) { + q.for_each_mut_batched::<4, 16>(|mut e| *e = E, |mut e| e.as_array_mut()[1] = E); + } + + let mut system_compute = IntoSystem::into_system(system_compute); + system_compute.initialize(&mut world); + system_compute.run((), &mut world); + 
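+
+        //The run above only needs to complete without panicking: with 127 entities and N = 4 it
+        //exercises both the batched and the scalar paths for a zero-sized component.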
} + #[test] fn mut_to_immut_query_methods_have_immut_item() { #[derive(Component)] diff --git a/crates/bevy_ecs/src/query/state.rs b/crates/bevy_ecs/src/query/state.rs index 18874fd107fae5..1ab602e83f5fee 100644 --- a/crates/bevy_ecs/src/query/state.rs +++ b/crates/bevy_ecs/src/query/state.rs @@ -3,19 +3,24 @@ use crate::{ component::ComponentId, entity::Entity, prelude::FromWorld, + ptr::elain::{Align, Alignment}, query::{ Access, DebugCheckedUnwrap, FilteredAccess, QueryCombinationIter, QueryIter, WorldQuery, }, storage::TableId, world::{World, WorldId}, }; +use bevy_ptr::ThinSimdAlignedSlicePtr; use bevy_tasks::ComputeTaskPool; #[cfg(feature = "trace")] use bevy_utils::tracing::Instrument; use fixedbitset::FixedBitSet; use std::{borrow::Borrow, fmt, mem::MaybeUninit}; -use super::{NopWorldQuery, QueryManyIter, ROQueryItem, ReadOnlyWorldQuery}; +use super::{ + NopWorldQuery, QueryBatch, QueryItem, QueryManyIter, ROQueryBatch, ROQueryItem, + ReadOnlyWorldQuery, WorldQueryBatch, +}; /// Provides scoped access to a [`World`] state according to a given [`WorldQuery`] and query filter. #[repr(C)] @@ -792,6 +797,30 @@ impl QueryState { } } + /// A read-only version of [`for_each_mut_batched`](Self::for_each_mut_batched). Detailed docs can be found there regarding how to use this function. + #[inline] + pub fn for_each_batched<'w, const N: usize, const MIN_ALIGN: usize>( + &'w mut self, + world: &'w mut World, + func: impl FnMut(ROQueryItem<'w, Q>), + func_batch: impl FnMut(ROQueryBatch<'w, Q, N, MIN_ALIGN>), + ) where + ::ReadOnly: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: query has unique world access + unsafe { + self.update_archetypes(world); + self.as_readonly().for_each_unchecked_manual_batched( + world, + func, + func_batch, + world.last_change_tick(), + world.read_change_tick(), + ); + } + } + /// Runs `func` on each query result for the given [`World`]. This is faster than the equivalent /// `iter_mut()` method, but cannot be chained like a normal [`Iterator`]. #[inline] @@ -807,6 +836,46 @@ impl QueryState { ); } } + /// This is a batched version of [`for_each_mut`](Self::for_each_mut) that accepts a batch size `N` together with a desired alignment for the batches `ALIGN`. + /// The advantage of using batching in queries is that it enables SIMD acceleration of your code to help you meet your performance goals. + /// This function accepts two arguments, `func`, and `func_batch` which represent the "scalar" and "vector" (or "batched") paths of your code respectively. + /// + /// ## Usage: + /// + /// * Supported values of `ALIGN` are 16, 32, and 64 + /// * `N` must be a power of 2 + /// * `func_batch` receives components in "batches" that are aligned to `ALIGN`. + /// * `func` functions exactly as does in [`for_each_mut`](Self::for_each_mut) -- it receives "scalar" (non-batched) components. + /// + /// In other words, `func_batch` composes the "fast path" of your query, and `func` is the "slow path". + /// + /// Batches are currently one of [`AlignedBatch16`](bevy_ptr::batch::AlignedBatch16), [`AlignedBatch32`](bevy_ptr::batch::AlignedBatch32), + /// or [`AlignedBatch64`](bevy_ptr::batch::AlignedBatch64) types, each corresponding to a guaranteed batch alignment. + /// The batch alignment is important as it enables architecture-specific optimizations that depend on alignment. + /// + /// See [`Query::for_each_mut_batched`](crate::system::Query::for_each_mut_batched) for a complete example of how to use this function. 
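+    ///
+    /// Below is a minimal sketch of the call shape when driving the query state directly; the
+    /// `Health` component is purely illustrative.
+    ///
+    /// ```rust
+    /// # use bevy_ecs::prelude::*;
+    /// #[derive(Component)]
+    /// struct Health(f32);
+    ///
+    /// let mut world = World::new();
+    /// world.spawn_batch((0..8).map(|i| (Health(i as f32),)));
+    ///
+    /// let mut state = world.query::<&mut Health>();
+    /// state.for_each_mut_batched::<4, 16>(
+    ///     &mut world,
+    ///     // Scalar path: runs for the tail and whenever a full batch is not possible.
+    ///     |mut health| health.0 += 1.0,
+    ///     // Batched path: `batch` dereferences to an `AlignedBatch<Health, 4, 16>`.
+    ///     |mut batch| {
+    ///         for health in batch.as_array_mut() {
+    ///             health.0 += 1.0;
+    ///         }
+    ///     },
+    /// );
+    /// ```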
+ #[inline] + pub fn for_each_mut_batched<'w, const N: usize, const MIN_ALIGN: usize>( + &'w mut self, + world: &'w mut World, + func: impl FnMut(QueryItem<'w, Q>), + func_batch: impl FnMut(QueryBatch<'w, Q, N, MIN_ALIGN>), + ) where + Q: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: query has unique world access + unsafe { + self.update_archetypes(world); + self.for_each_unchecked_manual_batched( + world, + func, + func_batch, + world.last_change_tick(), + world.read_change_tick(), + ); + } + } /// Runs `func` on each query result for the given [`World`]. This is faster than the equivalent /// iter() method, but cannot be chained like a normal [`Iterator`]. @@ -945,7 +1014,7 @@ impl QueryState { let entities = table.entities(); for row in 0..table.entity_count() { - let entity = entities.get_unchecked(row); + let entity = entities.get(row); if !F::filter_fetch(&mut filter, *entity, row) { continue; } @@ -980,6 +1049,146 @@ impl QueryState { } } + //TODO: allow differing batch alignments... right now everything is forced to `ALIGN`, but + //it is actually possible to offer batches of different components at different alignments + //when generalized const expresions are stable. I.e,the following: + // gcd::euclid_usize(crate::ptr::batch::MAX_SIMD_ALIGNMENT, N * core::mem::size_of::()); + pub(crate) unsafe fn for_each_unchecked_manual_batched< + 'w, + const N: usize, + const MIN_ALIGN: usize, + FN: FnMut(QueryItem<'w, Q>), + FnBatch: FnMut(QueryBatch<'w, Q, N, MIN_ALIGN>), + >( + &self, + world: &'w World, + mut func: FN, + mut func_batch: FnBatch, + last_change_tick: u32, + change_tick: u32, + ) where + Q: WorldQueryBatch, + Align: Alignment, + { + // NOTE: If you are changing query iteration code, remember to update the following places, where relevant: + // QueryIter, QueryIterationCursor, QueryManyIter, QueryCombinationIter, QueryState::for_each_unchecked_manual, QueryState::par_for_each_unchecked_manual + let mut fetch = Q::init_fetch(world, &self.fetch_state, last_change_tick, change_tick); + let mut filter = F::init_fetch(world, &self.filter_state, last_change_tick, change_tick); + + //Can't use this because it captures a mutable reference to fetch and filter + + let serial_portion = |entities: ThinSimdAlignedSlicePtr<'w, Entity>, + fetch: &mut Q::Fetch<'w>, + filter: &mut F::Fetch<'w>, + func: &mut FN, + range| { + for table_index in range { + let entity = entities.get(table_index); + if !F::filter_fetch(filter, *entity, table_index) { + continue; + } + let item = Q::fetch(fetch, *entity, table_index); + func(item); + } + }; + + let tables = &world.storages().tables; + if Q::IS_DENSE && F::IS_DENSE { + for table_id in &self.matched_table_ids { + let table = &tables[*table_id]; + let entities = table.entities(); + Q::set_table(&mut fetch, &self.fetch_state, table); + F::set_table(&mut filter, &self.filter_state, table); + + let mut table_index = 0; + + //ALIGNED PORTION: + + //table_index = prologue_end; + + let batch_end = table.batchable_region_end::(); + + while table_index < batch_end { + //TODO PERF: since both the Query and the Filter are dense, can this be precomputed? + //NOTE: if F = (), this optimizes right out, so don't worry about performance in that case. + let mut unbatchable = None; + for i in 0..N { + let table_row = table_index + i; + let entity = entities.get(table_row); + + if !F::filter_fetch(&mut filter, *entity, table_row) { + //Cannot do a full batch, fallback to scalar. + //Already checked the filter against everything up until now. 
+ //Therefore, do an *unchecked* serial portion. + for p in table_index..table_row { + let entity = entities.get(p); + let item = Q::fetch(&mut fetch, *entity, p); + func(item); + } + + //Handle the rest after + unbatchable = Some(table_row..table_index + N); + break; + } + } + + if let Some(rest) = unbatchable { + serial_portion(entities, &mut fetch, &mut filter, &mut func, rest); + } else { + //TODO: assume likely/hot path + let aligned_entity_batch = + entities.get_batch_aligned(table_index, table.entity_count()); + + let batch = Q::fetch_batched( + &mut fetch, + aligned_entity_batch, + table_index, + table.entity_count(), + ); + func_batch(batch); + } + + table_index += N; + } + + //EPILOGUE: + serial_portion( + entities, + &mut fetch, + &mut filter, + &mut func, + batch_end..table.entity_count(), + ); + } + } else { + //TODO: accelerate with batching, but first need to figure out if it's worth trying to batch sparse queries + let archetypes = &world.archetypes; + for archetype_id in &self.matched_archetype_ids { + let archetype = archetypes.get(*archetype_id).debug_checked_unwrap(); + let table = tables.get(archetype.table_id()).debug_checked_unwrap(); + Q::set_archetype(&mut fetch, &self.fetch_state, archetype, table); + F::set_archetype(&mut filter, &self.filter_state, archetype, table); + + let entities = archetype.entities(); + for idx in 0..archetype.len() { + let archetype_entity = entities.get_unchecked(idx); + if !F::filter_fetch( + &mut filter, + archetype_entity.entity, + archetype_entity.table_row, + ) { + continue; + } + func(Q::fetch( + &mut fetch, + archetype_entity.entity, + archetype_entity.table_row, + )); + } + } + } + } + /// Runs `func` on each query result in parallel for the given [`World`], where the last change and /// the current change tick are given. This is faster than the equivalent /// iter() method, but cannot be chained like a normal [`Iterator`]. @@ -1039,7 +1248,7 @@ impl QueryState { Q::set_table(&mut fetch, &self.fetch_state, table); F::set_table(&mut filter, &self.filter_state, table); for row in offset..offset + len { - let entity = entities.get_unchecked(row); + let entity = entities.get(row); if !F::filter_fetch(&mut filter, *entity, row) { continue; } diff --git a/crates/bevy_ecs/src/storage/aligned_vec.rs b/crates/bevy_ecs/src/storage/aligned_vec.rs new file mode 100644 index 00000000000000..fee0a0d9b14beb --- /dev/null +++ b/crates/bevy_ecs/src/storage/aligned_vec.rs @@ -0,0 +1,258 @@ +use core::alloc::Layout; +use core::borrow::{Borrow, BorrowMut}; +use core::marker::PhantomData; +use core::mem::needs_drop; +use core::ops::{Deref, DerefMut}; + +use core::cmp; +use core::slice::SliceIndex; + +use bevy_ptr::{OwningPtr, ThinSimdAlignedSlicePtr}; + +use super::blob_vec::BlobVec; + +/// A vector whose internal buffer is aligned to `MAX_SIMD_ALIGNMENT`. +/// Intended to support SIMD use cases. +/// +/// Used to densely store homogeneous ECS data whose type is known at compile time. +/// Built on `BlobVec`. It is not intended to be a drop-in replacement for Vec at this time. + +/* +NOTE: AlignedVec is ONLY implemented in terms of BlobVec because the Allocator API is not stable yet. +Once the Allocator API is stable, one could easily define AlignedVec as being a Vec with an allocator +that provides MAX_SIMD_ALIGNMENT as a guarantee, and remove almost all of the code in this file: + + type AlignedVec = Vec; + +As it stands, AlignedVec is a stand-in to provide just enough functionality to work for bevy_ecs. 
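+
+For illustration only, the end state would look roughly like the sketch below. This leans on the
+unstable `allocator_api` feature and a hypothetical `SimdAlignedAlloc` type, so treat it as an
+outline rather than working code:
+
+    #![feature(allocator_api)]
+    use std::alloc::{AllocError, Allocator, Global, Layout};
+    use std::ptr::NonNull;
+
+    struct SimdAlignedAlloc;
+
+    unsafe impl Allocator for SimdAlignedAlloc {
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+            //Round every request up to MAX_SIMD_ALIGNMENT before delegating to the global allocator.
+            Global.allocate(layout.align_to(MAX_SIMD_ALIGNMENT).map_err(|_| AllocError)?)
+        }
+        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+            //Free with the same padded layout that `allocate` used.
+            Global.deallocate(ptr, layout.align_to(MAX_SIMD_ALIGNMENT).unwrap());
+        }
+    }
+
+    type SimdAlignedVec<T> = Vec<T, SimdAlignedAlloc>;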
+*/ +pub(crate) struct SimdAlignedVec { + vec: BlobVec, + _marker: PhantomData, +} + +impl Default for SimdAlignedVec { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for SimdAlignedVec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AlignedVec") + .field("vec", &self.vec) + .finish() + } +} + +impl SimdAlignedVec { + // SAFETY: The pointer points to a valid value of type `T` and it is safe to drop this value. + unsafe fn drop_ptr(x: OwningPtr<'_>) { + x.drop_as::(); + } + + pub fn with_capacity(capacity: usize) -> SimdAlignedVec { + Self { + // SAFETY: + // `drop` accurately reflects whether the contents of this Vec need to be dropped, and correctly performs the drop operation. + vec: unsafe { + BlobVec::new( + Layout::new::(), + needs_drop::().then_some(Self::drop_ptr as _), + capacity, + ) + }, + _marker: PhantomData, + } + } + + pub fn new() -> SimdAlignedVec { + Self::with_capacity(0) //Ensure a starting power-of-two capacity (for non-ZSTs) + } + + #[inline] + pub fn len(&self) -> usize { + self.vec.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.vec.len() == 0 + } + + #[inline] + pub fn capacity(&self) -> usize { + self.vec.capacity() + } + + /// # Safety + /// It is the caller's responsibility to ensure that `index` is < self.len() + #[inline] + pub unsafe fn get_unchecked(&self, index: usize) -> &>::Output { + debug_assert!(index < self.len()); + + self.vec.get_unchecked(index).deref() + } + + /// # Safety + /// It is the caller's responsibility to ensure that `index` is < self.len() + #[inline] + pub unsafe fn get_unchecked_mut( + &mut self, + index: usize, + ) -> &mut >::Output { + debug_assert!(index < self.len()); + + self.vec.get_unchecked_mut(index).deref_mut() + } + + //This function attempts to keep the same semantics as Vec's swap_remove function + pub fn swap_remove(&mut self, index: usize) -> T { + #[cold] + #[inline(never)] + fn assert_failed(index: usize, len: usize) -> ! { + panic!("swap_remove index (is {index}) should be < len (is {len})"); + } + let len = self.len(); + if index >= len { + assert_failed(index, len); + } + + // SAFETY: + // The index is guaranteed to be in bounds by this point. + unsafe { self.vec.swap_remove_and_forget_unchecked(index).read() } + } + + pub fn push(&mut self, value: T) { + // SAFETY: + // value is a valid owned instance of T, therefore it is safe to call push with it + OwningPtr::make(value, |ptr| unsafe { + self.vec.push(ptr); + }); + } + + pub fn reserve_exact(&mut self, additional: usize) { + self.vec.reserve_exact(additional); + } + + // From RawVec soruce code, for compatibility + const MIN_NON_ZERO_CAP: usize = if core::mem::size_of::() == 1 { + 8 + } else if core::mem::size_of::() <= 1024 { + 4 + } else { + 1 + }; + + //This function attempts to keep the same semantics as Vec's reserve function + pub fn reserve(&mut self, additional: usize) { + if core::mem::size_of::() == 0 { + // Since we return a capacity of `usize::MAX` when `elem_size` is + // 0, getting to here necessarily means the `AlignedVec` is overfull. + panic!("AlignedVec capacity overflow") + } + + // Nothing we can really do about these checks, sadly. + let required_cap = self.vec.len().checked_add(additional); + + if let Some(cap) = required_cap { + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. 
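+            // Worked example: with len = 10, capacity = 16 and additional = 20, `required_cap` is 30;
+            // doubling the capacity gives 32, so we grow to 32 rather than to exactly 30.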
+ let cap = cmp::max(self.vec.capacity() * 2, cap); + let cap = cmp::max(Self::MIN_NON_ZERO_CAP, cap); + + self.reserve_exact(cap - self.vec.len()); + } else { + panic!("AlignedVec capacity overflow") + } + } + + pub fn clear(&mut self) { + self.vec.clear(); + } + + pub fn get_slice(&self) -> ThinSimdAlignedSlicePtr<'_, T> { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with an shared reference, guaranteeing only other shared references exist. + // The returned ThinSimdAlignedSlicePtr does not permit mutation of the vector, unless T is `UnsafeCell`, + // in which case all standard aliasing rules apply and this is passed onto the user. + // Therefore, the aliasing guarantees are upheld. + unsafe { self.vec.get_slice().as_deref() } + } +} + +impl Borrow<[T]> for SimdAlignedVec { + fn borrow(&self) -> &[T] { + self + } +} + +impl BorrowMut<[T]> for SimdAlignedVec { + fn borrow_mut(&mut self) -> &mut [T] { + self + } +} + +impl AsRef<[T]> for SimdAlignedVec { + fn as_ref(&self) -> &[T] { + self + } +} + +impl AsMut<[T]> for SimdAlignedVec { + fn as_mut(&mut self) -> &mut [T] { + self + } +} + +impl Deref for SimdAlignedVec { + type Target = [T]; + + #[inline] + fn deref(&self) -> &[T] { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with an shared reference, guaranteeing only other shared references exist. + // Therefore, it is safe to provide a shared reference to its contents. + unsafe { + std::slice::from_raw_parts(self.vec.get_ptr().as_ptr() as *const T, self.vec.len()) + } + } +} + +impl DerefMut for SimdAlignedVec { + #[inline] + fn deref_mut(&mut self) -> &mut [T] { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with a mutable reference, guaranteeing uniqueness. + // Therefore, it is safe to provide a mutable reference to its contents. + unsafe { + core::slice::from_raw_parts_mut( + self.vec.get_ptr_mut().as_ptr() as *mut T, + self.vec.len(), + ) + } + } +} + +impl<'a, T> IntoIterator for &'a mut SimdAlignedVec { + type Item = <&'a mut [T] as IntoIterator>::Item; + + type IntoIter = <&'a mut [T] as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.as_mut().iter_mut() + } +} + +impl<'a, T> IntoIterator for &'a SimdAlignedVec { + type Item = <&'a [T] as IntoIterator>::Item; + + type IntoIter = <&'a [T] as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.as_ref().iter() + } +} diff --git a/crates/bevy_ecs/src/storage/blob_vec.rs b/crates/bevy_ecs/src/storage/blob_vec.rs index 384c26a07dc7af..2d610cc474c2a1 100644 --- a/crates/bevy_ecs/src/storage/blob_vec.rs +++ b/crates/bevy_ecs/src/storage/blob_vec.rs @@ -5,7 +5,9 @@ use std::{ ptr::NonNull, }; -use bevy_ptr::{OwningPtr, Ptr, PtrMut}; +use bevy_ptr::batch::MAX_SIMD_ALIGNMENT; + +use crate::ptr::{batch, OwningPtr, Ptr, PtrMut, ThinSimdAlignedSlicePtr}; /// A flat, type-erased data storage type /// @@ -49,10 +51,23 @@ impl BlobVec { drop: Option)>, capacity: usize, ) -> BlobVec { + /*NOTE: Dangling pointers still need to be well aligned for the type when using slices (even though they are 0-length). + This is important for [`SimdAlignedVec`] and any function that would return a slice view of this BlobVec. + + Since neither strict_provenance nor alloc_layout_extra is stable, there is no way to construct a NonNull::dangling() + pointer from `item_layout` without using a pointer cast. 
This requires `-Zmiri-permissive-provenance` when testing, + otherwise Miri will issue a warning. + + TODO: Rewrite this when strict_provenance or alloc_layout_extra is stable. + */ + + let dangling = + NonNull::new(item_layout.align().max(MAX_SIMD_ALIGNMENT) as *mut u8).unwrap(); + if item_layout.size() == 0 { BlobVec { - swap_scratch: NonNull::dangling(), - data: NonNull::dangling(), + swap_scratch: dangling, + data: dangling, capacity: usize::MAX, len: 0, item_layout, @@ -63,7 +78,7 @@ impl BlobVec { .unwrap_or_else(|| std::alloc::handle_alloc_error(item_layout)); let mut blob_vec = BlobVec { swap_scratch, - data: NonNull::dangling(), + data: dangling, capacity: 0, len: 0, item_layout, @@ -300,13 +315,13 @@ impl BlobVec { unsafe { PtrMut::new(self.data) } } - /// Get a reference to the entire [`BlobVec`] as if it were an array with elements of type `T` + /// Get a reference to the entire [`BlobVec`] as if it were an array with elements of type `T`. /// /// # Safety /// The type `T` must be the type of the items in this [`BlobVec`]. - pub unsafe fn get_slice(&self) -> &[UnsafeCell] { + pub unsafe fn get_slice(&self) -> ThinSimdAlignedSlicePtr> { // SAFETY: the inner data will remain valid for as long as 'self. - std::slice::from_raw_parts(self.data.as_ptr() as *const UnsafeCell, self.len) + ThinSimdAlignedSlicePtr::new(self.data.as_ptr() as *mut UnsafeCell, self.len) } pub fn clear(&mut self) { @@ -353,6 +368,9 @@ impl Drop for BlobVec { fn array_layout(layout: &Layout, n: usize) -> Option { let (array_layout, offset) = repeat_layout(layout, n)?; debug_assert_eq!(layout.size(), offset); + + //Note: NEEDED for batching. This is the layout of the array itself, not the layout of its elements. + let array_layout = array_layout.align_to(batch::MAX_SIMD_ALIGNMENT).unwrap(); Some(array_layout) } diff --git a/crates/bevy_ecs/src/storage/mod.rs b/crates/bevy_ecs/src/storage/mod.rs index 6e848a042b492c..d894197f8e7ee7 100644 --- a/crates/bevy_ecs/src/storage/mod.rs +++ b/crates/bevy_ecs/src/storage/mod.rs @@ -1,5 +1,6 @@ //! Storage layouts for ECS data. +pub(super) mod aligned_vec; mod blob_vec; mod resource; mod sparse_set; diff --git a/crates/bevy_ecs/src/storage/sparse_set.rs b/crates/bevy_ecs/src/storage/sparse_set.rs index fdb9a21176a5ff..3988180dc39701 100644 --- a/crates/bevy_ecs/src/storage/sparse_set.rs +++ b/crates/bevy_ecs/src/storage/sparse_set.rs @@ -1,6 +1,7 @@ use crate::{ component::{ComponentId, ComponentInfo, ComponentTicks}, entity::Entity, + storage::aligned_vec::SimdAlignedVec, storage::Column, }; use bevy_ptr::{OwningPtr, Ptr}; @@ -244,8 +245,8 @@ impl ComponentSparseSet { /// `I` is the type of the indices, while `V` is the type of data stored in the dense storage. 
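+/// The dense storage is backed by the crate's `SimdAlignedVec`, so its buffer is aligned to
+/// `MAX_SIMD_ALIGNMENT`.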
#[derive(Debug)] pub struct SparseSet { - dense: Vec, - indices: Vec, + dense: SimdAlignedVec, + indices: SimdAlignedVec, sparse: SparseArray, } @@ -255,10 +256,10 @@ impl Default for SparseSet { } } impl SparseSet { - pub const fn new() -> Self { + pub fn new() -> Self { Self { - dense: Vec::new(), - indices: Vec::new(), + dense: SimdAlignedVec::new(), + indices: SimdAlignedVec::new(), sparse: SparseArray::new(), } } @@ -267,8 +268,8 @@ impl SparseSet { impl SparseSet { pub fn with_capacity(capacity: usize) -> Self { Self { - dense: Vec::with_capacity(capacity), - indices: Vec::with_capacity(capacity), + dense: SimdAlignedVec::with_capacity(capacity), + indices: SimdAlignedVec::with_capacity(capacity), sparse: Default::default(), } } diff --git a/crates/bevy_ecs/src/storage/table.rs b/crates/bevy_ecs/src/storage/table.rs index adb684469a6d6c..57a1004aff5059 100644 --- a/crates/bevy_ecs/src/storage/table.rs +++ b/crates/bevy_ecs/src/storage/table.rs @@ -2,9 +2,9 @@ use crate::{ component::{ComponentId, ComponentInfo, ComponentTicks, Components}, entity::Entity, query::DebugCheckedUnwrap, - storage::{blob_vec::BlobVec, SparseSet}, + storage::{aligned_vec::SimdAlignedVec, blob_vec::BlobVec, SparseSet}, }; -use bevy_ptr::{OwningPtr, Ptr, PtrMut}; +use bevy_ptr::{OwningPtr, Ptr, PtrMut, ThinSimdAlignedSlicePtr}; use bevy_utils::HashMap; use std::alloc::Layout; use std::{ @@ -35,7 +35,7 @@ impl TableId { #[derive(Debug)] pub struct Column { data: BlobVec, - ticks: Vec>, + ticks: SimdAlignedVec>, } impl Column { @@ -44,7 +44,7 @@ impl Column { Column { // SAFETY: component_info.drop() is valid for the types that will be inserted. data: unsafe { BlobVec::new(component_info.layout(), component_info.drop(), capacity) }, - ticks: Vec::with_capacity(capacity), + ticks: SimdAlignedVec::with_capacity(capacity), } } @@ -187,13 +187,13 @@ impl Column { /// # Safety /// The type `T` must be the type of the items in this column. - pub unsafe fn get_data_slice(&self) -> &[UnsafeCell] { + pub unsafe fn get_data_slice(&self) -> ThinSimdAlignedSlicePtr> { self.data.get_slice() } #[inline] - pub fn get_ticks_slice(&self) -> &[UnsafeCell] { - &self.ticks + pub fn get_ticks_slice(&self) -> ThinSimdAlignedSlicePtr> { + self.ticks.get_slice() } #[inline] @@ -264,20 +264,20 @@ impl Column { pub struct Table { columns: SparseSet, - entities: Vec, + entities: SimdAlignedVec, } impl Table { pub(crate) fn with_capacity(capacity: usize, column_capacity: usize) -> Table { Self { columns: SparseSet::with_capacity(column_capacity), - entities: Vec::with_capacity(capacity), + entities: SimdAlignedVec::with_capacity(capacity), } } #[inline] - pub fn entities(&self) -> &[Entity] { - &self.entities + pub fn entities(&self) -> ThinSimdAlignedSlicePtr<'_, Entity> { + self.entities.get_slice() } pub(crate) fn add_column(&mut self, component_info: &ComponentInfo) { @@ -462,6 +462,20 @@ impl Table { self.columns.capacity() } + #[inline] + pub fn batchable_region_end(&self) -> usize { + //Critical invariant: each Component storage is aligned to MAX_SIMD_ALIGNMENT + //and each component in the query can be batched e.g., for (Q1, Q2), both Q1 and Q2 issue aligned batches + //Therefore, for the given query, the batch size of N is valid. 
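+        //Worked example: a table of 127 entities with N = 4 splits as 127 = 31*4 + 3, so the
+        //batchable region is [0, 124) and rows 124..127 take the scalar path (the
+        //`batched_queries` test asserts exactly this split).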
+ + //The components are divided into [batchable region][scalar region] + //Given the above invariants, the euclidian division of table.len() = bN + s, 0 <= s < N, gives + //b batches with a scalar region of s + //Therefore, the batch region of indices is from [0, bN), and the scalar is [bN, table.len()) + + (self.entity_count() / N) * N + } + #[inline] pub fn is_empty(&self) -> bool { self.entities.is_empty() diff --git a/crates/bevy_ecs/src/system/query.rs b/crates/bevy_ecs/src/system/query.rs index 8f7e4f70bc3035..88c11c89117e67 100644 --- a/crates/bevy_ecs/src/system/query.rs +++ b/crates/bevy_ecs/src/system/query.rs @@ -1,9 +1,11 @@ use crate::{ component::Component, entity::Entity, + ptr::elain::{Align, Alignment}, query::{ - QueryCombinationIter, QueryEntityError, QueryIter, QueryManyIter, QuerySingleError, - QueryState, ROQueryItem, ReadOnlyWorldQuery, WorldQuery, + QueryBatch, QueryCombinationIter, QueryEntityError, QueryItem, QueryIter, QueryManyIter, + QuerySingleError, QueryState, ROQueryBatch, ROQueryItem, ReadOnlyWorldQuery, WorldQuery, + WorldQueryBatch, }, world::{Mut, World}, }; @@ -687,6 +689,29 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> Query<'w, 's, Q, F> { }; } + /// See [`QueryState::for_each_batched`](QueryState::for_each_batched) for how to use this function. + #[inline] + pub fn for_each_batched<'a, const N: usize, const MIN_ALIGN: usize>( + &'a mut self, + func: impl FnMut(ROQueryItem<'a, Q>), + func_batch: impl FnMut(ROQueryBatch<'a, Q, N, MIN_ALIGN>), + ) where + ::ReadOnly: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: system runs without conflicts with other systems. same-system queries have runtime + // borrow checks when they conflict + unsafe { + self.state.as_readonly().for_each_unchecked_manual_batched( + self.world, + func, + func_batch, + self.last_change_tick, + self.change_tick, + ); + }; + } + /// Runs `f` on each query item. /// /// # Example @@ -725,6 +750,231 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> Query<'w, 's, Q, F> { }; } + /// This is a "batched" version of [`for_each_mut`](Self::for_each_mut) that accepts a batch size `N` together with a desired alignment for the batches `ALIGN`. + /// The advantage of using batching in queries is that it enables SIMD acceleration (vectorization) of your code to help you meet your performance goals. + /// This function accepts two arguments, `func`, and `func_batch` which represent the "scalar" and "vector" (or "batched") paths of your code respectively. + /// Each "batch" contains `N` query results, in order, with a guaranteed alignment for the batch to aid in vectorization of the query. + /// + /// # A very brief introduction to SIMD + /// + /// SIMD, or Single Instruction, Multiple Data, is a paradigm that allows a single instruction to operate on multiple datums in parallel. + /// It is most commonly seen in "vector" instruction set extensions such as AVX and NEON, where it is possible to, for example, add + /// two arrays of `[f32; 4]` together in a single instruction. When used appropriately, SIMD is a very powerful tool that can greatly accelerate certain types of workloads. + /// An introductory treatment of SIMD can be found [on Wikipedia](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) for interested readers. + /// + /// [Vectorization](https://stackoverflow.com/questions/1422149/what-is-vectorization) is an informal term to describe optimizing code to leverage these SIMD instruction sets. + /// + /// # Just what is this alignment thing, anyway? 
+ /// + /// [This chapter](https://doc.rust-lang.org/reference/type-layout.html) of the Rust reference is a great treatment on alignment. + /// + /// Vector instructions often come with memory operand alignment restrictions that make their usage more complicated, and these + /// are typically based on the vector size in bytes. For example, a `[f32; 4]` vector is 16 bytes long, and in SSE4 some instructions + /// taking a `[f32; 4]` memory operand require it to be 16 bytes aligned. "Unaligned moves" are possible, but may carry a performance + /// penalty under certain circumstances. If the compiler can't prove that a memory operand will be aligned appropriately, it must assume + /// the worst case and emit code expecting unaligned data. + /// + /// Fortunately, you can provide a guaranteed batch alignment using the `ALIGN` parameter. This ensures that your batch + /// alignment is at least `ALIGN` (a compile-time assertion ensures if this is not possible, the build cannot succeed). + /// The result is that the Rust compiler can directly see that your reads and writes are aligned appropriately + /// and enable usage of vector instructions with aligned memory operands. That is, Bevy proves that memory operands will be aligned to at least `ALIGN`, + /// enabling greater optimization potential, and also helping you if you choose to write SIMD intrinsics directly. + /// + /// When generalized const expressions are stable, the guaranteed alignment of your batch will be calculated for you automatically + /// and you won't need to pass in the `ALIGN` parameter. + /// + /// # When should I consider batching for my query? + /// + /// The first thing you should consider is if you are meeting your performance goals. Batching a query is fundamentally an optimization, and if your application is meeting performance requirements + /// already, then (other than for your own entertainment) you won't get much benefit out of batching. If you are having performance problems though, the next step is to + /// use a [profiler](https://nnethercote.github.io/perf-book/profiling.html) to determine the running characteristics of your code. + /// If, after profiling your code, you have determined that a substantial amount of time is being processing a query, and it's hindering your performance goals, + /// then it might be worth it to consider batching to meet them. + /// + /// One of the main tradeoffs with batching your queries is that there will be an increased complexity from maintaining both code paths: `func` and `func_batch` + /// semantically should be doing the same thing, and it should always be possible to interchange them without visible program effects. + /// + /// # What kinds of queries make sense to batch? + /// + /// Usually math related ones: an example is given below showing how to accelerate a simple `position += velocity * time` calculation using batching. + /// Anything involving floats is a possible candidate. Depending on your component layout, you may need to perform a data layout conversion + /// to batch the query optimally. This Wikipedia page on ["array of struct" and "struct of array" layouts](https://en.wikipedia.org/wiki/AoS_and_SoA) is a good starter on + /// this topic, as is this [Intel blog post](https://www.intel.com/content/www/us/en/developer/articles/technical/memory-layout-transformations.html). + /// The example below uses data layout conversion. + /// + /// Vectorizing code can be a very deep subject to get into. 
+ /// Sometimes it can be very straightfoward to accomplish what you want to do, and other times it takes a bit of playing around to make your problem fit the SIMD model. + /// + /// # Will batching always make my queries faster? + /// + /// Unfortunately it will not. A suboptimally written batched query will probably perform worse than a straightforward `for_each_mut` query. Data layout conversion, + /// for example, carries overhead that may not always be worth it. Fortunately, your profiler can help you identify these situations. + /// + /// Think of batching as a tool in your performance toolbox rather than the preferred way of writing your queries. + /// + /// # What kinds of queries are batched right now? + /// + /// Currently, only "Dense" queries are actually batched; other queries will only use `func` and never call `func_batch`. This will improve + /// in the future. + /// + /// # Usage: + /// + /// * Supported values of `ALIGN` are 16, 32, and 64 + /// * `N` must be a power of 2 + /// * `func_batch` receives components in "batches" that are aligned to `ALIGN`. + /// * `func` functions exactly as does in [`for_each_mut`](Self::for_each_mut) -- it receives "scalar" (non-batched) components. + /// + /// In other words, `func_batch` composes the "fast path" of your query, and `func` is the "slow path". + /// + /// Batches are currently one of [`AlignedBatch16`](bevy_ptr::batch::AlignedBatch16), [`AlignedBatch32`](bevy_ptr::batch::AlignedBatch32), + /// or [`AlignedBatch64`](bevy_ptr::batch::AlignedBatch64) types, each corresponding to a guaranteed batch alignment. + /// The batch alignment is important as it enables architecture-specific optimizations that depend on alignment. + /// + /// If you attempt to call this function with an invalid generic argument, a compile-time error will be issued. For example, + /// if you request 64-bytes alignment on a batch whose size is 8 bytes, your program will fail to compile. In general, + /// choose `ALIGN` as large as you can for your batch parameters. The minimum alignment supported for batches is 16 bytes. + /// + /// **Note**: Once generalized const expressions are stable, it will be possible to compute `ALIGN` automatically based on the component type and batch size. + /// Compile-time assertions will still exist for sanity checking. + /// + /// In general, when using this function, be mindful of the types of filters being used with your query, as these can fragment your batches + /// and cause the scalar path to be taken more often. + /// + /// **Note**: It is always valid for the implementation of this function to only call `func`. Currently, batching is only supported for "Dense" queries. + /// Calling this function on any other query type will result in only the slow path being executed (e.g., queries with Sparse components.) + /// More query types may become batchable in the future. + /// + /// **Note**: Although this function provides the groundwork for writing performance-portable SIMD code, you will still need to take into account + /// your target architecture's capabilities. The batch size will likely need to be tuned for your application, for example. + /// When SIMD becomes stabilized in Rust, it will be possible to write code that is generic over the batch width, but some degree of tuning will likely always be + /// necessary. Think of this as a tool at your disposal to meet your performance goals. 
+ ///
+ /// # Example: Accelerate a simple "`Position += Velocity * time`" calculation using SIMD
+ ///
+ /// `Position` and `Velocity` are represented using `Vec3`s. A batch width of 4 is chosen to match SSE4 and allow the use of the `Vec4` type to perform SIMD. This example uses the [`as_inner`](bevy_ptr::batch::AlignedBatch16::as_inner)
+ /// functions to easily and efficiently process components with a single member.
+ ///
+ /// ```rust
+ /// use bevy_ecs::prelude::*;
+ /// use bevy_ptr::batch::AlignedBatch;
+ /// use bevy_ptr::bytemuck::TransparentWrapper;
+ /// use bevy_math::{Vec3, Vec4};
+ ///
+ /// #[derive(Clone, Copy, Component, PartialEq, Debug)]
+ /// //We want repr(transparent) here to access the `as_inner` AlignedBatch functions.
+ /// #[repr(transparent)]
+ /// struct Position(Vec3);
+ ///
+ /// //Important: this allows us to "cast away" the outer layer of Position
+ /// // SAFETY: Position is repr(transparent) and contains a Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Position {}
+ ///
+ /// #[derive(Clone, Copy, Component, PartialEq, Debug)]
+ /// //We want repr(transparent) here to access the `as_inner` AlignedBatch functions.
+ /// #[repr(transparent)]
+ /// struct Velocity(Vec3);
+ ///
+ /// //Important: this allows us to "cast away" the outer layer of Velocity
+ /// // SAFETY: Velocity is repr(transparent) and contains a Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Velocity {}
+ ///
+ /// //Convert the AoS representation to an SoA representation amenable to SIMD operations.
+ /// //This uses the `AsRef` trait to generically deal with different alignments
+ /// //on size-4 batches.
+ /// fn aos_to_soa(aos: &impl AsRef<[Vec3; 4]>) -> [Vec4; 3]
+ /// {
+ /// let [p0, p1, p2, p3] = aos.as_ref();
+ ///
+ /// let xs = Vec4::new(p0.x, p1.x, p2.x, p3.x);
+ /// let ys = Vec4::new(p0.y, p1.y, p2.y, p3.y);
+ /// let zs = Vec4::new(p0.z, p1.z, p2.z, p3.z);
+ ///
+ /// [xs, ys, zs]
+ /// }
+ ///
+ /// //Convert the SoA representation back to AoS for storing back into the ECS.
+ /// fn soa_to_aos(soa: &[Vec4; 3]) -> [Vec3; 4]
+ /// {
+ /// let [xs, ys, zs] = soa;
+ ///
+ /// let p0 = Vec3::new(xs.x, ys.x, zs.x);
+ /// let p1 = Vec3::new(xs.y, ys.y, zs.y);
+ /// let p2 = Vec3::new(xs.z, ys.z, zs.z);
+ /// let p3 = Vec3::new(xs.w, ys.w, zs.w);
+ ///
+ /// [p0, p1, p2, p3]
+ /// }
+ ///
+ /// const DELTA_TIMESTEP: f32 = 1.0/60.0;
+ ///
+ /// fn position_update_system(mut query: Query<(&mut Position, &Velocity)>)
+ /// {
+ /// //Execute this query in batches of 4 components with an alignment of 16.
+ /// //Note that if this is not possible, the program will fail to compile.
+ /// //Try changing "16" to "32" here, for example, and observe what happens.
+ /// //In the future, the alignment will be computed for you automatically when
+ /// //generic const expressions are stable.
+ /// query.for_each_mut_batched::<4, 16>(|(mut position, velocity)|
+ /// {
+ /// //The scalar path -- this is only executed for the epilogue or when
+ /// //filters are present that would fragment the query
+ /// position.0 += DELTA_TIMESTEP * velocity.0;
+ /// },
+ /// |(mut position, velocity)|
+ /// {
+ /// //The batched path -- `position` and `velocity` are now AlignedBatch types.
+ /// //This uses SIMD to perform the calculation.
+ /// //Note that this example is designed around SSE4.
+ /// //For AVX2, for example, you would want batches of 8 `f32`s.
+ /// //When SIMD is stabilized in Rust, it could be replaced with generic vector-width code.
+ ///
+ /// //NOTE: change trackers currently will add overhead.
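+ /// //(writing through a batch marks every element of the batch as changed)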
+ /// //If this is a problem, uncomment the line below:
+ /// //let position = position.bypass_change_detection();
+ ///
+ /// //This lets us treat our batch of Position as a batch of Vec3
+ /// let mut ps = position.as_inner_mut::<Vec3>();
+ ///
+ /// // Turn [Vec3; 4] into [Vec4; 3] form for SIMD acceleration
+ /// let [ps_x, ps_y, ps_z] = aos_to_soa(ps);
+ /// let [vs_x, vs_y, vs_z] = aos_to_soa(velocity.as_inner::<Vec3>());
+ /// //NOTE: the above can also be achieved using velocity.map(|c| c.0)
+ ///
+ /// //Now that we have our [Vec4; 3], this line will optimize well using SIMD
+ /// let newps = [ps_x + vs_x * DELTA_TIMESTEP,
+ /// ps_y + vs_y * DELTA_TIMESTEP,
+ /// ps_z + vs_z * DELTA_TIMESTEP];
+ ///
+ /// //But our intermediate work is still in SoA representation!
+ /// //We still need to translate it back.
+ /// //"into" is used to convert the [Vec3; 4] into an AlignedBatch16.
+ /// //It *should* optimize right out.
+ /// *ps = soa_to_aos(&newps).into();
+ /// });
+ /// }
+ /// # bevy_ecs::system::assert_is_system(position_update_system);
+ /// ```
+ #[inline]
+ pub fn for_each_mut_batched<'a, const N: usize, const MIN_ALIGN: usize>(
+ &'a mut self,
+ func: impl FnMut(QueryItem<'a, Q>),
+ func_batch: impl FnMut(QueryBatch<'a, Q, N, MIN_ALIGN>),
+ ) where
+ Q: WorldQueryBatch,
+ Align<MIN_ALIGN>: Alignment,
+ {
+ // SAFETY: system runs without conflicts with other systems. same-system queries have runtime
+ // borrow checks when they conflict
+ unsafe {
+ self.state.for_each_unchecked_manual_batched(
+ self.world,
+ func,
+ func_batch,
+ self.last_change_tick,
+ self.change_tick,
+ );
+ };
+ }
+
 /// Runs `f` on each read-only query item in parallel.
 ///
 /// Parallelization is achieved by using the [`World`]'s [`ComputeTaskPool`].
diff --git a/crates/bevy_ecs/src/world/world_cell.rs b/crates/bevy_ecs/src/world/world_cell.rs
index 7679e51c407303..6c292ef29e3ed2 100644
--- a/crates/bevy_ecs/src/world/world_cell.rs
+++ b/crates/bevy_ecs/src/world/world_cell.rs
@@ -33,7 +33,7 @@ impl Default for ArchetypeComponentAccess {
 const UNIQUE_ACCESS: usize = 0;
 const BASE_ACCESS: usize = 1;
 impl ArchetypeComponentAccess {
- const fn new() -> Self {
+ fn new() -> Self {
 Self {
 access: SparseSet::new(),
 }
 }
diff --git a/crates/bevy_ptr/Cargo.toml b/crates/bevy_ptr/Cargo.toml
index 04b7e30fc9b5b4..671b8165c44abc 100644
--- a/crates/bevy_ptr/Cargo.toml
+++ b/crates/bevy_ptr/Cargo.toml
@@ -9,3 +9,6 @@ license = "MIT OR Apache-2.0"
 keywords = ["bevy", "no_std"]
 
 [dependencies]
+elain = "0.3"
+bytemuck = "1.12"
+bevy_math = { path = "../bevy_math", version = "0.9.0-dev" }
\ No newline at end of file
diff --git a/crates/bevy_ptr/src/batch.rs b/crates/bevy_ptr/src/batch.rs
new file mode 100644
index 00000000000000..c26ffe7cd07e0b
--- /dev/null
+++ b/crates/bevy_ptr/src/batch.rs
@@ -0,0 +1,360 @@
+use crate::ThinSimdAlignedSlicePtr;
+
+use core::{
+ cell::UnsafeCell,
+ marker::PhantomData,
+ ops::{Index, IndexMut},
+};
+
+use crate::bytemuck;
+
+use elain::{Align, Alignment};
+
+use bevy_math::Vec4;
+
+/*
+NOTE: We define this constant here as both [`bevy_ptr`] and [`bevy_ecs`] need to know about it.
+
+If this is a problem, this can be replaced with code that looks something like the following:
+
+ #[cfg(all(any(target_feature = "avx"), not(target_feature = "avx512f")))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 32;
+
+ #[cfg(any(target_feature = "avx512f"))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 64;
+
+ //All platforms get 16-byte alignment on tables guaranteed.
+ #[cfg(not(any(target_feature = "avx512f")))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 16;
+*/
+/// The maximum SIMD alignment for a given target.
+/// `MAX_SIMD_ALIGNMENT` is 64 for the following reasons:
+/// 1. It ensures that table columns are aligned to cache lines on x86.
+/// 2. 64 is the maximum alignment required to use all instructions on all known CPU architectures.
+/// This greatly simplifies handling cross-platform alignment on a case-by-case basis: by aligning to the worst case, we align for all cases.
+/// 3. The overhead of aligning columns to 64 bytes is very small, as columns will in general be much larger than this.
pub const MAX_SIMD_ALIGNMENT: usize = 64;
+
+//TODO: AoSoA representations
+
+//TODO: when possible, compute alignments automatically using the GCD (requires generic const expressions) and ensure the
+//batch lambda accepts arguments of AlignedBatchTrait directly. This will let the alignment be automatically computed
+//and allow different query elements to have different alignments.
+
+/// A batch of `N` components of type `T`, aligned to at least `MIN_ALIGN` bytes.
+/// Generic code can work with batches of varying sizes and alignments through the methods on this type
+/// and its `AsRef`, `AsMut`, and `From` implementations (and, in the future, through Rust SIMD once it is stabilized).
+// FIXME: what if align(T) > MIN_ALIGN?
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct AlignedBatch<T, const N: usize, const MIN_ALIGN: usize>
+where
+ Align<MIN_ALIGN>: Alignment,
+{
+ _align: Align<MIN_ALIGN>,
+ batch: [T; N],
+}
+
+impl<T, const N: usize, const MIN_ALIGN: usize> AlignedBatch<T, N, MIN_ALIGN>
+where
+ Align<MIN_ALIGN>: Alignment,
+{
+ //These only make sense when a component is #[repr(transparent)].
+ //For example, if you had a repr(transparent) Position component that contained a Vec3, it
+ //would be semantically valid to get a reference to the inner component.
+ //The TransparentWrapper unsafe trait from the bytemuck crate is used to make this process usable in user code.
+
+ /// If `T` is `repr(transparent)`, then `as_inner` can be used to get a shared reference to an "inner view" of the batch.
+ /// To use this, implement [`bytemuck::TransparentWrapper`] for your component.
+ /// For example, if you had a `repr(transparent)` `Position` component that contained a [`bevy_math::Vec3`], you could treat a batch of `Position` as a batch of [`bevy_math::Vec3`].
+ #[inline]
+ pub fn as_inner<Inner>(&self) -> &AlignedBatch<Inner, N, MIN_ALIGN>
+ where
+ T: bytemuck::TransparentWrapper<Inner>,
+ {
+ // SAFETY:
+ //
+ // * T is repr(transparent), with inner type Inner
+ // * AlignedBatch<T, N, MIN_ALIGN> and AlignedBatch<Inner, N, MIN_ALIGN> are both repr(C)
+ // * Therefore AlignedBatch<T, N, MIN_ALIGN> and AlignedBatch<Inner, N, MIN_ALIGN> have the same layout
+ // * Since self is a shared reference, creating more shared references to the same memory is OK.
+ unsafe { &*(self as *const Self as *const AlignedBatch<Inner, N, MIN_ALIGN>) }
+ }
+
+ /// If `T` is `repr(transparent)`, then `as_inner_mut` can be used to get a mutable reference to an "inner view" of the batch.
+ /// To use this, implement [`bytemuck::TransparentWrapper`] for your component.
+ /// For example, if you had a `repr(transparent)` `Position` component that contained a [`bevy_math::Vec3`], you could treat a batch of `Position` as a batch of [`bevy_math::Vec3`].
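+ ///
+ /// A rough sketch of the intended pattern (the `Position` wrapper here is illustrative and not part of this crate):
+ ///
+ /// ```ignore
+ /// use bevy_math::Vec3;
+ /// use bevy_ptr::batch::AlignedBatch;
+ /// use bevy_ptr::bytemuck::TransparentWrapper;
+ ///
+ /// #[derive(Clone, Copy)]
+ /// #[repr(transparent)]
+ /// struct Position(Vec3);
+ ///
+ /// // SAFETY: Position is repr(transparent) over Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Position {}
+ ///
+ /// let mut batch: AlignedBatch<Position, 4, 16> = [Position(Vec3::ZERO); 4].into();
+ /// // View (and mutate) the same memory as a batch of Vec3
+ /// batch.as_inner_mut::<Vec3>().as_array_mut()[0].x = 1.0;
+ /// assert_eq!(batch.as_array()[0].0.x, 1.0);
+ /// ```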
+ #[inline] + pub fn as_inner_mut(&mut self) -> &mut AlignedBatch + where + T: bytemuck::TransparentWrapper, + { + // SAFETY: + // + // * Recommended pattern from the Rust book: https://doc.rust-lang.org/std/mem/fn.transmute.html + // * Section: "turning an &mut T into an &mut U" + // * T is repr(transparent), with inner type Inner + // * $batch_type is repr(transparent) + // * $batch_type is repr(transparent) + // * Therefore $batch_type and $batch_type + unsafe { &mut *(self as *mut Self as *mut AlignedBatch) } + } + + /// Constructs a new batch with the result of `func` mapped over the components of this batch. + //TODO: this doesn't optimize very well... + #[inline] + pub fn map U>(self, func: F) -> AlignedBatch { + AlignedBatch:: { + _align: self._align, + batch: self.batch.map(func), + } + } + + /// Retrieve a shared reference to this batch as an array of `[T; N]` + /// You can use this to destructure your batch into elements if needed. + #[inline] + pub fn as_array(&self) -> &[T; N] { + self.as_ref() + } + + /// Retrieve a mutable reference to this batch as an array of `[T; N]`. + /// You can use this to modify elements of your batch. + #[inline] + pub fn as_array_mut(&mut self) -> &mut [T; N] { + self.as_mut() + } + + /// Convert this batch into an array of `[T; N]`. + /// A convenience function, as all batches implement [`From`] and [`Into`] for `[T; N]`. + #[inline] + pub fn into_array(self) -> [T; N] { + self.into() + } + + //TODO: add support for as_simd()/into_simd() when SIMD is stabilized! +} + +impl From<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + #[inline] + fn from(batch: [T; N]) -> Self { + Self { + _align: Align::NEW, + batch, + } + } +} + +impl From> for [T; N] +where + Align: Alignment, +{ + #[inline] + fn from(v: AlignedBatch) -> Self { + v.batch + } +} + +impl Index for AlignedBatch +where + Align: Alignment, +{ + type Output = T; + + #[inline] + fn index(&self, i: usize) -> &>::Output { + &self.batch[i] + } +} + +impl IndexMut for AlignedBatch +where + Align: Alignment, +{ + #[inline] + fn index_mut(&mut self, i: usize) -> &mut >::Output { + &mut self.batch[i] + } +} + +impl AsRef<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + fn as_ref(&self) -> &[T; N] { + &self.batch + } +} + +impl AsMut<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + fn as_mut(&mut self) -> &mut [T; N] { + &mut self.batch + } +} + +// TODO: when stable, replace `ALIGN` with an `Alignment` enum +// OR when general const expressions are stable, replace with a trait constraint `Alignment`. +// Do the same for batch sizes. For now, this is the best we can do. + +//Convenience impls that can go away when SIMD is stabilized +impl AsRef for AlignedBatch { + #[inline] + fn as_ref(&self) -> &Vec4 { + // SAFETY: + // * Alignment of Vec4 is 16 + // * Alignment of Self is 16 + // * Self is repr(C) and therefore can be treated as an [f32; 4] + // * Vec4 is repr(transparent) and can be treated as an [f32; 4] (it is an __mm128) + // * Only shared refs exist + // Therefore this cast is sound. 
+ unsafe { &*(self as *const Self as *const Vec4) } + } +} + +impl AsMut for AlignedBatch { + #[inline] + fn as_mut(&mut self) -> &mut Vec4 { + // SAFETY: + // * Alignment of Vec4 is 16 + // * Alignment of Self is 16 + // * Self is repr(C) and therefore can be treated as an [f32; 4] + // * Vec4 is repr(transparent) and can be treated as an [f32; 4] (it is an __mm128) + // * &mut T to &mut U pattern used from the Rust book to ensure soundness when casting mutable refs + // Therefore this cast is sound. + unsafe { &mut *(self as *mut Self as *mut Vec4) } + } +} + +impl<'a, T> ThinSimdAlignedSlicePtr<'a, T> { + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The batch size in bytes must be a multiple of `ALIGN`. + /// A compile-time error will be given if the alignment requirements cannot be met with the given parameters. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + #[inline] + unsafe fn get_batch_aligned_raw( + self, + index: usize, + _len: usize, + ) -> *const AlignedBatch + where + Align: Alignment, + { + //Can't use this method if the batch doesn't make sense (performance safety). + //STATIC ASSERTIONS. Rust has no support for these directly... + #[allow(clippy::let_unit_value)] + let _ = Assert::::YOUR_BATCH_SIZE_IS_NOT_A_MULTIPLE_OF_ALIGN; + #[allow(clippy::let_unit_value)] + let _ = Assert::::MIN_ALIGN_IS_NOT_A_POWER_OF_TWO; //FIXME: no longer needed + #[allow(clippy::let_unit_value)] + let _ = Assert::::BATCH_SIZE_IS_NOT_A_POWER_OF_TWO; //FIXME: not actually required + #[allow(clippy::let_unit_value)] + let _ = Assert::::MIN_ALIGN_IS_LESS_THAN_ALIGN_OF_T; //FIXME: no longer needed + + #[cfg(debug_assertions)] + debug_assert!(index + N < self.len); + #[cfg(debug_assertions)] + debug_assert_eq!(_len, self.len); + #[cfg(debug_assertions)] + debug_assert_eq!(index % N, 0); + + let off_ptr = self.ptr.as_ptr().add(index); + + //NOTE: ZSTs may cause this "slice" to point into nothingness. + //This sounds dangerous, but won't cause harm as nothing + //will actually access anything "in the slice" + + //TODO: when pointer_is_aligned is standardized, we can just use ptr::is_aligned() + #[cfg(debug_assertions)] + debug_assert_eq!(off_ptr as usize % MIN_ALIGN, 0); + + //SAFETY: off_ptr is not null + off_ptr as *const AlignedBatch + } + + /// Indexes the slice without doing bounds checks with a batch size of N. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be suitably aligned. + #[inline] + pub unsafe fn get_batch_aligned( + self, + index: usize, + len: usize, + ) -> &'a AlignedBatch + where + Align: Alignment, + { + &(*self.get_batch_aligned_raw(index, len)) + } +} + +impl<'a, T> ThinSimdAlignedSlicePtr<'a, UnsafeCell> { + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The semantics are like `UnsafeCell` -- you must ensure the aliasing constraints are met. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + /// No other references exist to the batch of size `N` at `index` + #[inline] + pub unsafe fn get_batch_aligned_deref_mut( + self, + index: usize, + len: usize, + ) -> &'a mut AlignedBatch + where + Align: Alignment, + { + &mut *(self + .as_deref() + .get_batch_aligned_raw::(index, len) + as *mut AlignedBatch) + } + + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The semantics are like `UnsafeCell` -- you must ensure the aliasing constraints are met. 
+ /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + /// No mutable references exist to the batch of size `N` at `index` + #[inline] + pub unsafe fn get_batch_aligned_deref( + self, + index: usize, + len: usize, + ) -> &'a AlignedBatch + where + Align: Alignment, + { + &*(self + .as_deref() + .get_batch_aligned_raw::(index, len)) + } +} + +//Inspired from: https://github.com/rust-lang/rust/issues/57775#issuecomment-1098001375 +struct Assert { + _marker: PhantomData, +} + +impl Assert { + const YOUR_BATCH_SIZE_IS_NOT_A_MULTIPLE_OF_ALIGN: () = + assert!((N * core::mem::size_of::()) % MIN_ALIGN == 0); + const MIN_ALIGN_IS_NOT_A_POWER_OF_TWO: () = assert!(MIN_ALIGN.is_power_of_two()); + const BATCH_SIZE_IS_NOT_A_POWER_OF_TWO: () = assert!(N.is_power_of_two()); + const MIN_ALIGN_IS_LESS_THAN_ALIGN_OF_T: () = assert!(MIN_ALIGN >= core::mem::align_of::()); +} diff --git a/crates/bevy_ptr/src/lib.rs b/crates/bevy_ptr/src/lib.rs index 8f1f476a906e2b..553cd22f907743 100644 --- a/crates/bevy_ptr/src/lib.rs +++ b/crates/bevy_ptr/src/lib.rs @@ -2,8 +2,18 @@ #![no_std] #![warn(missing_docs)] +//TransparentWrapper is required for as_inner* conversions +//Re-exported from the [`bytemuck`] crate to ensure users can implement this trait for their own types +pub use bytemuck; + +//Re-exported the [`elain`] crate to ensure users can generically deal with alignments when using AlignedBatches +pub use elain; + use core::{cell::UnsafeCell, marker::PhantomData, mem::MaybeUninit, ptr::NonNull}; +/// Batch-related functionality +pub mod batch; + /// Type-erased borrow of some unknown type chosen when constructing this type. /// /// This type tries to act "borrow-like" which means that: @@ -125,6 +135,7 @@ impl<'a> Ptr<'a> { self.0.as_ptr() } } + impl_ptr!(PtrMut); impl<'a> PtrMut<'a> { /// Transforms this [`PtrMut`] into an [`OwningPtr`] @@ -155,6 +166,7 @@ impl<'a> PtrMut<'a> { self.0.as_ptr() } } + impl_ptr!(OwningPtr); impl<'a> OwningPtr<'a> { /// Consumes a value and creates an [`OwningPtr`] to it while ensuring a double drop does not happen. @@ -195,29 +207,77 @@ impl<'a> OwningPtr<'a> { } } -/// Conceptually equivalent to `&'a [T]` but with length information cut out for performance reasons -pub struct ThinSlicePtr<'a, T> { +/// Conceptually equivalent to `&'a [T]` but with length information cut out for performance reasons. +/// The slice itself is aligned to at least `MAX_SIMD_ALIGNMENT`, however its elements may not be. +/// Use the `align_to` method to get batches of suitable alignment. +pub struct ThinSimdAlignedSlicePtr<'a, T> { ptr: NonNull, #[cfg(debug_assertions)] len: usize, _marker: PhantomData<&'a [T]>, } -impl<'a, T> ThinSlicePtr<'a, T> { +impl<'a, T> ThinSimdAlignedSlicePtr<'a, T> { + /// # Safety + /// The contents of the slice returned by this function must never be accessed + #[inline] + pub unsafe fn dangling() -> Self { + let item_layout = core::alloc::Layout::new::(); + + let dangling = + NonNull::new(item_layout.align().max(batch::MAX_SIMD_ALIGNMENT) as *mut T).unwrap(); + + Self { + ptr: dangling, + #[cfg(debug_assertions)] + len: 0, + _marker: PhantomData, + } + } + + /// # Safety + /// `ptr` must be non-null + /// `ptr` must be aligned to at least `MAX_SIMD_ALIGNMENT` #[inline] + pub unsafe fn new(ptr: *mut T, _len: usize) -> Self { + //NOTE: ZSTs may cause this "slice" to point into nothingness. 
+ //This sounds dangerous, but won't cause harm as nothing + //will actually access anything "in the slice" + + //TODO: when pointer_is_aligned is standardized, we can just use ptr::is_aligned() + #[cfg(debug_assertions)] + debug_assert_eq!(ptr as usize % batch::MAX_SIMD_ALIGNMENT, 0); + + Self { + ptr: NonNull::new_unchecked(ptr), + #[cfg(debug_assertions)] + len: _len, + _marker: PhantomData, + } + } + /// Indexes the slice without doing bounds checks /// /// # Safety /// `index` must be in-bounds. + #[inline] pub unsafe fn get(self, index: usize) -> &'a T { #[cfg(debug_assertions)] debug_assert!(index < self.len); &*self.ptr.as_ptr().add(index) } + + /// # Safety + /// `index` must be in bounds + /// `index + len` must be in bounds + #[inline] + pub unsafe fn get_slice(self, index: usize, len: usize) -> &'a [T] { + core::slice::from_raw_parts(self.ptr.as_ptr().add(index), len) + } } -impl<'a, T> Clone for ThinSlicePtr<'a, T> { +impl<'a, T> Clone for ThinSimdAlignedSlicePtr<'a, T> { fn clone(&self) -> Self { Self { ptr: self.ptr, @@ -228,16 +288,17 @@ impl<'a, T> Clone for ThinSlicePtr<'a, T> { } } -impl<'a, T> Copy for ThinSlicePtr<'a, T> {} +impl<'a, T> Copy for ThinSimdAlignedSlicePtr<'a, T> {} -impl<'a, T> From<&'a [T]> for ThinSlicePtr<'a, T> { +//Helpers for the UnsafeCell cases +impl<'a, T> ThinSimdAlignedSlicePtr<'a, UnsafeCell> { + /// Get an immutable view of this `ThinSimdAlignedSlicePtr`'s contents. Note that this is not a reference type. #[inline] - fn from(slice: &'a [T]) -> Self { - Self { - // SAFETY: a reference can never be null - ptr: unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut T) }, + pub fn as_deref(self) -> ThinSimdAlignedSlicePtr<'a, T> { + ThinSimdAlignedSlicePtr::<'a, T> { + ptr: self.ptr.cast::(), #[cfg(debug_assertions)] - len: slice.len(), + len: self.len, _marker: PhantomData, } }