From 31ed43c340ee006eba508fcfd3c8eb746970c88c Mon Sep 17 00:00:00 2001 From: Shane Peelar Date: Sun, 6 Nov 2022 16:29:20 -0500 Subject: [PATCH] Implement batched query support * Only Dense queries are accelerated currently * Certain features are awaiting on GATs/generic const expressions * Code refactored to use GATs * `elain` crate now used to provide const generic alignments (PhantomData method) * Code still requires a fixed alignment request across whole query (in progress) * Batches support AsRef, AsMut, etc * Simplified calling for_each_{mut_}batched (no longer need _ arguments) * Add convenience functions for treating AlignedBatch16 as Vec4s * Add a map API for creating projections of aligned batches * Add a compile-time error in case a greater alignment is needed than the current batch alignment to satisfy T * Documentation about SIMD and batching * ALIGN now referred to as MIN_ALIGN for clarity --- crates/bevy_ecs/Cargo.toml | 1 + crates/bevy_ecs/src/archetype.rs | 6 +- crates/bevy_ecs/src/change_detection.rs | 145 ++++++++- crates/bevy_ecs/src/query/fetch.rs | 285 +++++++++++++++- crates/bevy_ecs/src/query/filter.rs | 4 +- crates/bevy_ecs/src/query/iter.rs | 9 +- crates/bevy_ecs/src/query/mod.rs | 112 +++++++ crates/bevy_ecs/src/query/state.rs | 215 +++++++++++- crates/bevy_ecs/src/storage/aligned_vec.rs | 258 +++++++++++++++ crates/bevy_ecs/src/storage/blob_vec.rs | 32 +- crates/bevy_ecs/src/storage/mod.rs | 1 + crates/bevy_ecs/src/storage/sparse_set.rs | 15 +- crates/bevy_ecs/src/storage/table.rs | 36 ++- crates/bevy_ecs/src/system/query.rs | 254 ++++++++++++++- crates/bevy_ecs/src/world/world_cell.rs | 2 +- crates/bevy_ptr/Cargo.toml | 3 + crates/bevy_ptr/src/batch.rs | 360 +++++++++++++++++++++ crates/bevy_ptr/src/lib.rs | 83 ++++- 18 files changed, 1757 insertions(+), 64 deletions(-) create mode 100644 crates/bevy_ecs/src/storage/aligned_vec.rs create mode 100644 crates/bevy_ptr/src/batch.rs diff --git a/crates/bevy_ecs/Cargo.toml b/crates/bevy_ecs/Cargo.toml index 29769f6e45e628..b0a1f132c64ed0 100644 --- a/crates/bevy_ecs/Cargo.toml +++ b/crates/bevy_ecs/Cargo.toml @@ -30,6 +30,7 @@ serde = { version = "1", features = ["derive"] } [dev-dependencies] rand = "0.8" +bevy_math = { path = "../bevy_math", version = "0.9.0-dev" } [[example]] name = "events" diff --git a/crates/bevy_ecs/src/archetype.rs b/crates/bevy_ecs/src/archetype.rs index 0fc9271a42c6cf..c6eeb39bdccf33 100644 --- a/crates/bevy_ecs/src/archetype.rs +++ b/crates/bevy_ecs/src/archetype.rs @@ -5,7 +5,7 @@ use crate::{ bundle::BundleId, component::{ComponentId, StorageType}, entity::{Entity, EntityLocation}, - storage::{SparseArray, SparseSet, SparseSetIndex, TableId}, + storage::{aligned_vec::SimdAlignedVec, SparseArray, SparseSet, SparseSetIndex, TableId}, }; use std::{ collections::HashMap, @@ -181,7 +181,7 @@ pub struct Archetype { id: ArchetypeId, table_id: TableId, edges: Edges, - entities: Vec, + entities: SimdAlignedVec, table_components: Box<[ComponentId]>, sparse_set_components: Box<[ComponentId]>, components: SparseSet, @@ -225,7 +225,7 @@ impl Archetype { Self { id, table_id, - entities: Vec::new(), + entities: SimdAlignedVec::new(), components, table_components, sparse_set_components, diff --git a/crates/bevy_ecs/src/change_detection.rs b/crates/bevy_ecs/src/change_detection.rs index b8d1f7c196d352..61114cf974b43c 100644 --- a/crates/bevy_ecs/src/change_detection.rs +++ b/crates/bevy_ecs/src/change_detection.rs @@ -1,7 +1,15 @@ //! Types that detect when their internal data mutate. 
+use crate::ptr::{ + batch::AlignedBatch, + elain::{Align, Alignment}, +}; + use crate::{component::ComponentTicks, ptr::PtrMut, system::Resource}; -use std::ops::{Deref, DerefMut}; +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; /// The (arbitrarily chosen) minimum number of world tick increments between `check_tick` scans. /// @@ -229,6 +237,15 @@ pub(crate) struct Ticks<'a> { pub(crate) change_tick: u32, } +pub(crate) struct TicksBatch<'a, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) component_ticks: &'a mut AlignedBatch, + pub(crate) last_change_tick: u32, + pub(crate) change_tick: u32, +} + /// Unique mutable borrow of a [`Resource`]. /// /// See the [`Resource`] documentation for usage. @@ -352,6 +369,132 @@ change_detection_impl!(Mut<'a, T>, T,); impl_methods!(Mut<'a, T>, T,); impl_debug!(Mut<'a, T>,); +/// Unique mutable borrow of an entity's component (batched version). +/// Each batch changes in unison: a batch has changed if any of its elements have changed. +pub struct MutBatch<'a, T, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) value: &'a mut AlignedBatch, + pub(crate) ticks: TicksBatch<'a, N, MIN_ALIGN>, + pub(crate) _marker: PhantomData, +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> DetectChanges for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn is_added(&self) -> bool { + self.ticks + .component_ticks + .as_array() + .iter() + .any(|x| x.is_added(self.ticks.last_change_tick, self.ticks.change_tick)) + } + + #[inline] + fn is_changed(&self) -> bool { + self.ticks + .component_ticks + .as_array() + .iter() + .any(|x| x.is_changed(self.ticks.last_change_tick, self.ticks.change_tick)) + } + + #[inline] + fn set_changed(&mut self) { + for ticks in self.ticks.component_ticks.as_array_mut().iter_mut() { + ticks.set_changed(self.ticks.change_tick); + } + } + + #[inline] + fn last_changed(&self) -> u32 { + self.ticks.last_change_tick + } + + type Inner = AlignedBatch; + + fn set_last_changed(&mut self, last_change_tick: u32) { + self.ticks.last_change_tick = last_change_tick; + } + + fn bypass_change_detection(&mut self) -> &mut Self::Inner { + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> Deref for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + type Target = AlignedBatch; + + #[inline] + fn deref(&self) -> &Self::Target { + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> DerefMut for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + self.set_changed(); + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> AsRef> + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn as_ref(&self) -> &AlignedBatch { + self.deref() + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> AsMut> + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + #[inline] + fn as_mut(&mut self) -> &mut AlignedBatch { + self.deref_mut() + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + /// Consume `self` and return a mutable reference to the + /// contained value while marking `self` as "changed". 
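+    ///
+    /// As with [`Mut::into_inner`], this is mainly useful when the returned reference needs to
+    /// outlive the `MutBatch` wrapper itself.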
+ #[inline] + pub fn into_inner(mut self) -> &'a mut AlignedBatch { + self.set_changed(); + self.value + } +} + +impl<'a, T, const N: usize, const MIN_ALIGN: usize> std::fmt::Debug + for MutBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, + AlignedBatch: std::fmt::Debug, + T: std::fmt::Debug, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple(stringify!($name)).field(&self.value).finish() + } +} + /// Unique mutable borrow of resources or an entity's component. /// /// Similar to [`Mut`], but not generic over the component type, instead diff --git a/crates/bevy_ecs/src/query/fetch.rs b/crates/bevy_ecs/src/query/fetch.rs index ad6598bcc6e0d6..81c93cd47acf9d 100644 --- a/crates/bevy_ecs/src/query/fetch.rs +++ b/crates/bevy_ecs/src/query/fetch.rs @@ -1,15 +1,20 @@ use crate::{ archetype::{Archetype, ArchetypeComponentId}, - change_detection::Ticks, + change_detection::{MutBatch, Ticks, TicksBatch}, component::{Component, ComponentId, ComponentStorage, ComponentTicks, StorageType}, entity::Entity, + ptr::{ + batch::AlignedBatch, + elain::{Align, Alignment}, + ThinSimdAlignedSlicePtr, UnsafeCellDeref, + }, query::{Access, DebugCheckedUnwrap, FilteredAccess}, storage::{ComponentSparseSet, Table}, world::{Mut, World}, }; + use bevy_ecs_macros::all_tuples; pub use bevy_ecs_macros::WorldQuery; -use bevy_ptr::{ThinSlicePtr, UnsafeCellDeref}; use std::{cell::UnsafeCell, marker::PhantomData}; /// Types that can be fetched from a [`World`] using a [`Query`]. @@ -426,6 +431,30 @@ pub unsafe trait WorldQuery { ) -> bool; } +/// An extension of [`WorldQuery`] for batched queries. +pub trait WorldQueryBatch: WorldQuery { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> + where + Align: Alignment; + + /// Retrieve a batch of size `N` with desired alignment `ALIGN` from the current table. + /// # Safety + /// + /// `table_row_start` is a valid table row index for the current table + /// `table_row_start` + `N` is a valid table row index for the current table + /// `table_row_start` is a multiple of `N` + /// + /// Must always be called _after_ [`WorldQuery::set_table`]. + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> Self::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment; +} + /// A world query that is read only. /// /// # Safety @@ -437,11 +466,17 @@ pub unsafe trait ReadOnlyWorldQuery: WorldQuery {} pub type QueryFetch<'w, Q> = ::Fetch<'w>; /// The item type returned when a [`WorldQuery`] is iterated over pub type QueryItem<'w, Q> = ::Item<'w>; +/// The item type returned when a [`WorldQuery`] is iterated over in a batched fashion +pub type QueryBatch<'w, Q, const N: usize, const MIN_ALIGN: usize> = + ::FullBatch<'w, N, MIN_ALIGN>; /// The read-only `Fetch` of a [`WorldQuery`], which is used to store state for each archetype/table. 
pub type ROQueryFetch<'w, Q> = QueryFetch<'w, ::ReadOnly>; /// The read-only variant of the item type returned when a [`WorldQuery`] is iterated over immutably pub type ROQueryItem<'w, Q> = QueryItem<'w, ::ReadOnly>; +pub type ROQueryBatch<'w, Q, const N: usize, const MIN_ALIGN: usize> = + QueryBatch<'w, ::ReadOnly, N, MIN_ALIGN>; + /// SAFETY: no component or archetype access unsafe impl WorldQuery for Entity { type Fetch<'w> = (); @@ -508,13 +543,32 @@ unsafe impl WorldQuery for Entity { } } +impl WorldQueryBatch for Entity { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = &'w AlignedBatch + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + entity_batch + } +} + /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for Entity {} #[doc(hidden)] pub struct ReadFetch<'w, T> { // T::Storage = TableStorage - table_components: Option>>, + table_components: Option>>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, } @@ -587,7 +641,6 @@ unsafe impl WorldQuery for &T { .get_column(component_id) .debug_checked_unwrap() .get_data_slice() - .into(), ); } @@ -649,16 +702,39 @@ unsafe impl WorldQuery for &T { /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for &T {} +impl WorldQueryBatch for &T { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = &'w AlignedBatch + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + //TODO: when generalized const expresions are stable, want the following: + //gcd::euclid_usize(ptr::MAX_SIMD_ALIGNMENT, N * core::mem::size_of::()); + + let components = fetch.table_components.debug_checked_unwrap(); + + components.get_batch_aligned_deref::(table_row_start, len) + } +} + #[doc(hidden)] pub struct WriteFetch<'w, T> { // T::Storage = TableStorage table_data: Option<( - ThinSlicePtr<'w, UnsafeCell>, - ThinSlicePtr<'w, UnsafeCell>, + ThinSimdAlignedSlicePtr<'w, UnsafeCell>, + ThinSimdAlignedSlicePtr<'w, UnsafeCell>, )>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, - last_change_tick: u32, change_tick: u32, } @@ -732,8 +808,8 @@ unsafe impl<'__w, T: Component> WorldQuery for &'__w mut T { ) { let column = table.get_column(component_id).debug_checked_unwrap(); fetch.table_data = Some(( - column.get_data_slice().into(), - column.get_ticks_slice().into(), + column.get_data_slice(), + column.get_ticks_slice(), )); } @@ -807,6 +883,44 @@ unsafe impl<'__w, T: Component> WorldQuery for &'__w mut T { } } +impl<'__w, T: Component> WorldQueryBatch for &'__w mut T { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = MutBatch<'w, T, N, MIN_ALIGN> + where + Align: Alignment + ; + + //FIXME: + /* + T: AlignedBatchGat, + ComponentTicks: AlignedBatchGat, + */ + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + let (table_components, table_ticks) = fetch.table_data.debug_checked_unwrap(); + + MutBatch:: { + value: 
table_components.get_batch_aligned_deref_mut::(table_row_start, len), + ticks: TicksBatch { + // SAFETY: [table_row_start..+batch.len()] is in range + component_ticks: table_ticks + .get_batch_aligned_deref_mut::(table_row_start, len), + change_tick: fetch.change_tick, + last_change_tick: fetch.last_change_tick, + }, + _marker: PhantomData, + } + } +} + #[doc(hidden)] pub struct OptionFetch<'w, T: WorldQuery> { fetch: T::Fetch<'w>, @@ -911,6 +1025,34 @@ unsafe impl WorldQuery for Option { } } +impl WorldQueryBatch for Option { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = Option> + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + if fetch.matches { + Some(T::fetch_batched( + &mut fetch.fetch, + entity_batch, + table_row_start, + len, + )) + } else { + None + } + } +} + /// SAFETY: [`OptionFetch`] is read only because `T` is read only unsafe impl ReadOnlyWorldQuery for Option {} @@ -989,10 +1131,46 @@ impl ChangeTrackers { } } +/// A batch of [`ChangeTrackers`]. This is used when performing queries with Change Trackers using the +/// [`Query::for_each_mut_batched`](crate::system::Query::for_each_mut_batched) and [`Query::for_each_batched`](crate::system::Query::for_each_batched) functions. +#[derive(Clone)] +pub struct ChangeTrackersBatch<'a, T, const N: usize, const MIN_ALIGN: usize> +where + Align: Alignment, +{ + pub(crate) component_ticks: &'a AlignedBatch, + pub(crate) last_change_tick: u32, + pub(crate) change_tick: u32, + marker: PhantomData, +} + +impl<'a, T: Component, const N: usize, const MIN_ALIGN: usize> ChangeTrackersBatch<'a, T, N, MIN_ALIGN> +where + Align: Alignment, +{ + /// Returns true if this component has been added since the last execution of this system. + #[inline] + pub fn is_added(&self) -> bool { + self.component_ticks + .as_array() + .iter() + .any(|x| x.is_added(self.last_change_tick, self.change_tick)) + } + + /// Returns true if this component has been changed since the last execution of this system. 
+ #[inline] + pub fn is_changed(&self) -> bool { + self.component_ticks + .as_array() + .iter() + .any(|x| x.is_changed(self.last_change_tick, self.change_tick)) + } +} + #[doc(hidden)] pub struct ChangeTrackersFetch<'w, T> { // T::Storage = TableStorage - table_ticks: Option>>, + table_ticks: Option>>, // T::Storage = SparseStorage sparse_set: Option<&'w ComponentSparseSet>, @@ -1075,7 +1253,6 @@ unsafe impl WorldQuery for ChangeTrackers { .get_column(id) .debug_checked_unwrap() .get_ticks_slice() - .into(), ); } @@ -1140,6 +1317,35 @@ unsafe impl WorldQuery for ChangeTrackers { } } +impl WorldQueryBatch for ChangeTrackers { + //FIXME: ComponentTicks: AlignedBatchGat, + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ChangeTrackersBatch<'w, T, N, MIN_ALIGN> + where + Align: Alignment; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + table_row_start: usize, + len: usize, + ) -> Self::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment, + { + ChangeTrackersBatch { + component_ticks: { + let table_ticks = fetch.table_ticks.debug_checked_unwrap(); + + table_ticks.get_batch_aligned_deref::(table_row_start, len) + }, + marker: PhantomData, + last_change_tick: fetch.last_change_tick, + change_tick: fetch.change_tick, + } + } +} + /// SAFETY: access is read only unsafe impl ReadOnlyWorldQuery for ChangeTrackers {} @@ -1239,9 +1445,34 @@ macro_rules! impl_tuple_fetch { } } + //FIXME: the magic can happen here with different ALIGNments!!! + #[allow(unused_variables)] + #[allow(non_snake_case)] + #[allow(clippy::unused_unit)] + impl< $($name: WorldQueryBatch),*> WorldQueryBatch for ($($name,)*) + { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ($($name::FullBatch<'w, N, MIN_ALIGN>,)*) + where + Align: Alignment + ; + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment + { + let ($($name,)*) = _fetch; + ($($name::fetch_batched($name, _entity_batch, _table_row_start, _len),)*) + } + } + /// SAFETY: each item in the tuple is read only unsafe impl<$($name: ReadOnlyWorldQuery),*> ReadOnlyWorldQuery for ($($name,)*) {} - }; } @@ -1384,6 +1615,36 @@ macro_rules! 
impl_anytuple_fetch { /// SAFETY: each item in the tuple is read only unsafe impl<$($name: ReadOnlyWorldQuery),*> ReadOnlyWorldQuery for AnyOf<($($name,)*)> {} + //FIXME: magic can happen here, too, for different ALIGNments + #[allow(unused_variables)] + #[allow(non_snake_case)] + #[allow(clippy::unused_unit)] + impl<$($name: WorldQueryBatch),*> WorldQueryBatch for AnyOf<($($name,)*)> + { + type FullBatch<'w, const N: usize, const MIN_ALIGN: usize> = ($(Option<$name::FullBatch<'w, N, MIN_ALIGN>>,)*) + where + Align: Alignment; + + + #[inline] + unsafe fn fetch_batched<'w, const N: usize, const MIN_ALIGN: usize>( + _fetch: &mut ::Fetch<'w>, + _entity_batch: &'w AlignedBatch, + _table_row_start: usize, + _len: usize, + ) -> ::FullBatch<'w, N, MIN_ALIGN> + where + Align: Alignment + { + let ($($name,)*) = _fetch; + + ($( + $name.1.then(|| $name::fetch_batched(&mut $name.0, _entity_batch, _table_row_start, _len)), + )*) + + } + } + }; } diff --git a/crates/bevy_ecs/src/query/filter.rs b/crates/bevy_ecs/src/query/filter.rs index 2cf2fc3e1c10ba..bc1d738cd46359 100644 --- a/crates/bevy_ecs/src/query/filter.rs +++ b/crates/bevy_ecs/src/query/filter.rs @@ -2,12 +2,12 @@ use crate::{ archetype::{Archetype, ArchetypeComponentId}, component::{Component, ComponentId, ComponentStorage, ComponentTicks, StorageType}, entity::Entity, + ptr::{ThinSimdAlignedSlicePtr, UnsafeCellDeref}, query::{Access, DebugCheckedUnwrap, FilteredAccess, WorldQuery}, storage::{ComponentSparseSet, Table}, world::World, }; use bevy_ecs_macros::all_tuples; -use bevy_ptr::{ThinSlicePtr, UnsafeCellDeref}; use std::{cell::UnsafeCell, marker::PhantomData}; use super::ReadOnlyWorldQuery; @@ -413,7 +413,7 @@ macro_rules! impl_tick_filter { #[doc(hidden)] $(#[$fetch_meta])* pub struct $fetch_name<'w, T> { - table_ticks: Option>>, + table_ticks: Option>>, marker: PhantomData, sparse_set: Option<&'w ComponentSparseSet>, last_change_tick: u32, diff --git a/crates/bevy_ecs/src/query/iter.rs b/crates/bevy_ecs/src/query/iter.rs index 8924c54829a2d5..115a59ced88bba 100644 --- a/crates/bevy_ecs/src/query/iter.rs +++ b/crates/bevy_ecs/src/query/iter.rs @@ -2,6 +2,7 @@ use crate::{ archetype::{ArchetypeEntity, ArchetypeId, Archetypes}, entity::{Entities, Entity}, prelude::World, + ptr::ThinSimdAlignedSlicePtr, query::{ArchetypeFilter, DebugCheckedUnwrap, QueryState, WorldQuery}, storage::{TableId, Tables}, }; @@ -469,7 +470,7 @@ impl<'w, 's, Q: ReadOnlyWorldQuery, F: ReadOnlyWorldQuery, const K: usize> Fused struct QueryIterationCursor<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> { table_id_iter: std::slice::Iter<'s, TableId>, archetype_id_iter: std::slice::Iter<'s, ArchetypeId>, - table_entities: &'w [Entity], + table_entities: ThinSimdAlignedSlicePtr<'w, Entity>, archetype_entities: &'w [ArchetypeEntity], fetch: Q::Fetch<'w>, filter: F::Fetch<'w>, @@ -540,7 +541,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, QueryIterationCursor { fetch, filter, - table_entities: &[], + table_entities: ThinSimdAlignedSlicePtr::dangling(), archetype_entities: &[], table_id_iter: query_state.matched_table_ids.iter(), archetype_id_iter: query_state.matched_archetype_ids.iter(), @@ -556,7 +557,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, if self.current_index > 0 { let index = self.current_index - 1; if Self::IS_DENSE { - let entity = self.table_entities.get_unchecked(index); + let entity = self.table_entities.get(index); Some(Q::fetch(&mut self.fetch, *entity, index)) } else { let 
archetype_entity = self.archetype_entities.get_unchecked(index); @@ -602,7 +603,7 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryIterationCursor<'w, 's, // SAFETY: set_table was called prior. // `current_index` is a table row in range of the current table, because if it was not, then the if above would have been executed. - let entity = self.table_entities.get_unchecked(self.current_index); + let entity = self.table_entities.get(self.current_index); if !F::filter_fetch(&mut self.filter, *entity, self.current_index) { self.current_index += 1; continue; diff --git a/crates/bevy_ecs/src/query/mod.rs b/crates/bevy_ecs/src/query/mod.rs index ac0767f4c08c3e..1f7f6a546d9d44 100644 --- a/crates/bevy_ecs/src/query/mod.rs +++ b/crates/bevy_ecs/src/query/mod.rs @@ -75,6 +75,9 @@ mod tests { #[derive(Component, Debug, Eq, PartialEq, Clone, Copy)] struct D(usize); + #[derive(Component)] + struct E; + #[derive(Component, Debug, Eq, PartialEq, Clone, Copy)] #[component(storage = "SparseSet")] struct Sparse(usize); @@ -783,6 +786,115 @@ mod tests { } } + #[test] + fn batched_queries() { + let mut world = World::new(); + + world.spawn_batch( + (0..127) + .into_iter() + .map(|i| (A(4 * i), B(4 * i + 1), C(4 * i + 2), D(4 * i + 3))), + ); + + fn system_compute(mut q: Query<(&mut A, &B, &C, &D)>) { + let mut scalar_counter = 0; + let mut batch_counter = 0; + + q.for_each_mut_batched::<4, 16>( + |(mut a, b, c, d)| { + assert_eq!(a.ticks.component_ticks.added, 1); + assert_eq!(a.ticks.component_ticks.changed, 1); + + a.0 += b.0 + c.0 + d.0; + scalar_counter += 1; + + assert_eq!(a.ticks.component_ticks.added, 1); + assert_eq!(a.ticks.component_ticks.changed, 2); + }, + |(mut a, b, c, d)| { + for ticks in a.ticks.component_ticks.as_array().iter() { + assert_eq!(ticks.added, 1); + assert_eq!(ticks.changed, 1); + } + + assert_eq!( + *a.as_array(), + [ + A(4 * batch_counter), + A(4 * (batch_counter + 1)), + A(4 * (batch_counter + 2)), + A(4 * (batch_counter + 3)) + ] + ); + + for (i, mut a_elem) in a.as_array_mut().iter_mut().enumerate() { + a_elem.0 += b.as_array()[i].0 + c.as_array()[i].0 + d.as_array()[i].0; + } + + for ticks in a.ticks.component_ticks.as_array().iter() { + assert_eq!(ticks.added, 1); + assert_eq!(ticks.changed, 2); + } + + batch_counter += 4; + }, + ); + + assert_eq!(scalar_counter, 3); + assert_eq!(batch_counter, 124); + } + fn system_check(mut q: Query<&A>) { + let mut scalar_counter = 0; + let mut batch_counter = 0; + + q.for_each_batched::<4, 16>( + |a| { + assert_eq!(*a, A(1990 + 16 * scalar_counter)); + + scalar_counter += 1; + }, + |a| { + assert_eq!( + *a.as_array(), + [ + A(16 * batch_counter + 6), + A(16 * (batch_counter + 1) + 6), + A(16 * (batch_counter + 2) + 6), + A(16 * (batch_counter + 3) + 6) + ] + ); + + batch_counter += 4; + }, + ); + } + + world.increment_change_tick(); + + let mut system_compute = IntoSystem::into_system(system_compute); + system_compute.initialize(&mut world); + system_compute.run((), &mut world); + + let mut system_check = IntoSystem::into_system(system_check); + system_check.initialize(&mut world); + system_check.run((), &mut world); + } + + #[test] + fn batched_queries_zst() { + let mut world = World::new(); + + world.spawn_batch((0..127).into_iter().map(|_| E)); + + fn system_compute(mut q: Query<&mut E>) { + q.for_each_mut_batched::<4, 16>(|mut e| *e = E, |mut e| e.as_array_mut()[1] = E); + } + + let mut system_compute = IntoSystem::into_system(system_compute); + system_compute.initialize(&mut world); + system_compute.run((), &mut world); + 
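+
+        //The run above only needs to complete without panicking: with 127 entities and N = 4 it
+        //exercises both the batched and the scalar paths for a zero-sized component.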
} + #[test] fn mut_to_immut_query_methods_have_immut_item() { #[derive(Component)] diff --git a/crates/bevy_ecs/src/query/state.rs b/crates/bevy_ecs/src/query/state.rs index 18874fd107fae5..1ab602e83f5fee 100644 --- a/crates/bevy_ecs/src/query/state.rs +++ b/crates/bevy_ecs/src/query/state.rs @@ -3,19 +3,24 @@ use crate::{ component::ComponentId, entity::Entity, prelude::FromWorld, + ptr::elain::{Align, Alignment}, query::{ Access, DebugCheckedUnwrap, FilteredAccess, QueryCombinationIter, QueryIter, WorldQuery, }, storage::TableId, world::{World, WorldId}, }; +use bevy_ptr::ThinSimdAlignedSlicePtr; use bevy_tasks::ComputeTaskPool; #[cfg(feature = "trace")] use bevy_utils::tracing::Instrument; use fixedbitset::FixedBitSet; use std::{borrow::Borrow, fmt, mem::MaybeUninit}; -use super::{NopWorldQuery, QueryManyIter, ROQueryItem, ReadOnlyWorldQuery}; +use super::{ + NopWorldQuery, QueryBatch, QueryItem, QueryManyIter, ROQueryBatch, ROQueryItem, + ReadOnlyWorldQuery, WorldQueryBatch, +}; /// Provides scoped access to a [`World`] state according to a given [`WorldQuery`] and query filter. #[repr(C)] @@ -792,6 +797,30 @@ impl QueryState { } } + /// A read-only version of [`for_each_mut_batched`](Self::for_each_mut_batched). Detailed docs can be found there regarding how to use this function. + #[inline] + pub fn for_each_batched<'w, const N: usize, const MIN_ALIGN: usize>( + &'w mut self, + world: &'w mut World, + func: impl FnMut(ROQueryItem<'w, Q>), + func_batch: impl FnMut(ROQueryBatch<'w, Q, N, MIN_ALIGN>), + ) where + ::ReadOnly: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: query has unique world access + unsafe { + self.update_archetypes(world); + self.as_readonly().for_each_unchecked_manual_batched( + world, + func, + func_batch, + world.last_change_tick(), + world.read_change_tick(), + ); + } + } + /// Runs `func` on each query result for the given [`World`]. This is faster than the equivalent /// `iter_mut()` method, but cannot be chained like a normal [`Iterator`]. #[inline] @@ -807,6 +836,46 @@ impl QueryState { ); } } + /// This is a batched version of [`for_each_mut`](Self::for_each_mut) that accepts a batch size `N` together with a desired alignment for the batches `ALIGN`. + /// The advantage of using batching in queries is that it enables SIMD acceleration of your code to help you meet your performance goals. + /// This function accepts two arguments, `func`, and `func_batch` which represent the "scalar" and "vector" (or "batched") paths of your code respectively. + /// + /// ## Usage: + /// + /// * Supported values of `ALIGN` are 16, 32, and 64 + /// * `N` must be a power of 2 + /// * `func_batch` receives components in "batches" that are aligned to `ALIGN`. + /// * `func` functions exactly as does in [`for_each_mut`](Self::for_each_mut) -- it receives "scalar" (non-batched) components. + /// + /// In other words, `func_batch` composes the "fast path" of your query, and `func` is the "slow path". + /// + /// Batches are currently one of [`AlignedBatch16`](bevy_ptr::batch::AlignedBatch16), [`AlignedBatch32`](bevy_ptr::batch::AlignedBatch32), + /// or [`AlignedBatch64`](bevy_ptr::batch::AlignedBatch64) types, each corresponding to a guaranteed batch alignment. + /// The batch alignment is important as it enables architecture-specific optimizations that depend on alignment. + /// + /// See [`Query::for_each_mut_batched`](crate::system::Query::for_each_mut_batched) for a complete example of how to use this function. 
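+    ///
+    /// Below is a minimal sketch of the call shape when driving the query state directly; the
+    /// `Health` component is purely illustrative.
+    ///
+    /// ```rust
+    /// # use bevy_ecs::prelude::*;
+    /// #[derive(Component)]
+    /// struct Health(f32);
+    ///
+    /// let mut world = World::new();
+    /// world.spawn_batch((0..8).map(|i| (Health(i as f32),)));
+    ///
+    /// let mut state = world.query::<&mut Health>();
+    /// state.for_each_mut_batched::<4, 16>(
+    ///     &mut world,
+    ///     // Scalar path: runs for the tail and whenever a full batch is not possible.
+    ///     |mut health| health.0 += 1.0,
+    ///     // Batched path: `batch` dereferences to an `AlignedBatch<Health, 4, 16>`.
+    ///     |mut batch| {
+    ///         for health in batch.as_array_mut() {
+    ///             health.0 += 1.0;
+    ///         }
+    ///     },
+    /// );
+    /// ```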
+ #[inline] + pub fn for_each_mut_batched<'w, const N: usize, const MIN_ALIGN: usize>( + &'w mut self, + world: &'w mut World, + func: impl FnMut(QueryItem<'w, Q>), + func_batch: impl FnMut(QueryBatch<'w, Q, N, MIN_ALIGN>), + ) where + Q: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: query has unique world access + unsafe { + self.update_archetypes(world); + self.for_each_unchecked_manual_batched( + world, + func, + func_batch, + world.last_change_tick(), + world.read_change_tick(), + ); + } + } /// Runs `func` on each query result for the given [`World`]. This is faster than the equivalent /// iter() method, but cannot be chained like a normal [`Iterator`]. @@ -945,7 +1014,7 @@ impl QueryState { let entities = table.entities(); for row in 0..table.entity_count() { - let entity = entities.get_unchecked(row); + let entity = entities.get(row); if !F::filter_fetch(&mut filter, *entity, row) { continue; } @@ -980,6 +1049,146 @@ impl QueryState { } } + //TODO: allow differing batch alignments... right now everything is forced to `ALIGN`, but + //it is actually possible to offer batches of different components at different alignments + //when generalized const expresions are stable. I.e,the following: + // gcd::euclid_usize(crate::ptr::batch::MAX_SIMD_ALIGNMENT, N * core::mem::size_of::()); + pub(crate) unsafe fn for_each_unchecked_manual_batched< + 'w, + const N: usize, + const MIN_ALIGN: usize, + FN: FnMut(QueryItem<'w, Q>), + FnBatch: FnMut(QueryBatch<'w, Q, N, MIN_ALIGN>), + >( + &self, + world: &'w World, + mut func: FN, + mut func_batch: FnBatch, + last_change_tick: u32, + change_tick: u32, + ) where + Q: WorldQueryBatch, + Align: Alignment, + { + // NOTE: If you are changing query iteration code, remember to update the following places, where relevant: + // QueryIter, QueryIterationCursor, QueryManyIter, QueryCombinationIter, QueryState::for_each_unchecked_manual, QueryState::par_for_each_unchecked_manual + let mut fetch = Q::init_fetch(world, &self.fetch_state, last_change_tick, change_tick); + let mut filter = F::init_fetch(world, &self.filter_state, last_change_tick, change_tick); + + //Can't use this because it captures a mutable reference to fetch and filter + + let serial_portion = |entities: ThinSimdAlignedSlicePtr<'w, Entity>, + fetch: &mut Q::Fetch<'w>, + filter: &mut F::Fetch<'w>, + func: &mut FN, + range| { + for table_index in range { + let entity = entities.get(table_index); + if !F::filter_fetch(filter, *entity, table_index) { + continue; + } + let item = Q::fetch(fetch, *entity, table_index); + func(item); + } + }; + + let tables = &world.storages().tables; + if Q::IS_DENSE && F::IS_DENSE { + for table_id in &self.matched_table_ids { + let table = &tables[*table_id]; + let entities = table.entities(); + Q::set_table(&mut fetch, &self.fetch_state, table); + F::set_table(&mut filter, &self.filter_state, table); + + let mut table_index = 0; + + //ALIGNED PORTION: + + //table_index = prologue_end; + + let batch_end = table.batchable_region_end::(); + + while table_index < batch_end { + //TODO PERF: since both the Query and the Filter are dense, can this be precomputed? + //NOTE: if F = (), this optimizes right out, so don't worry about performance in that case. + let mut unbatchable = None; + for i in 0..N { + let table_row = table_index + i; + let entity = entities.get(table_row); + + if !F::filter_fetch(&mut filter, *entity, table_row) { + //Cannot do a full batch, fallback to scalar. + //Already checked the filter against everything up until now. 
+ //Therefore, do an *unchecked* serial portion. + for p in table_index..table_row { + let entity = entities.get(p); + let item = Q::fetch(&mut fetch, *entity, p); + func(item); + } + + //Handle the rest after + unbatchable = Some(table_row..table_index + N); + break; + } + } + + if let Some(rest) = unbatchable { + serial_portion(entities, &mut fetch, &mut filter, &mut func, rest); + } else { + //TODO: assume likely/hot path + let aligned_entity_batch = + entities.get_batch_aligned(table_index, table.entity_count()); + + let batch = Q::fetch_batched( + &mut fetch, + aligned_entity_batch, + table_index, + table.entity_count(), + ); + func_batch(batch); + } + + table_index += N; + } + + //EPILOGUE: + serial_portion( + entities, + &mut fetch, + &mut filter, + &mut func, + batch_end..table.entity_count(), + ); + } + } else { + //TODO: accelerate with batching, but first need to figure out if it's worth trying to batch sparse queries + let archetypes = &world.archetypes; + for archetype_id in &self.matched_archetype_ids { + let archetype = archetypes.get(*archetype_id).debug_checked_unwrap(); + let table = tables.get(archetype.table_id()).debug_checked_unwrap(); + Q::set_archetype(&mut fetch, &self.fetch_state, archetype, table); + F::set_archetype(&mut filter, &self.filter_state, archetype, table); + + let entities = archetype.entities(); + for idx in 0..archetype.len() { + let archetype_entity = entities.get_unchecked(idx); + if !F::filter_fetch( + &mut filter, + archetype_entity.entity, + archetype_entity.table_row, + ) { + continue; + } + func(Q::fetch( + &mut fetch, + archetype_entity.entity, + archetype_entity.table_row, + )); + } + } + } + } + /// Runs `func` on each query result in parallel for the given [`World`], where the last change and /// the current change tick are given. This is faster than the equivalent /// iter() method, but cannot be chained like a normal [`Iterator`]. @@ -1039,7 +1248,7 @@ impl QueryState { Q::set_table(&mut fetch, &self.fetch_state, table); F::set_table(&mut filter, &self.filter_state, table); for row in offset..offset + len { - let entity = entities.get_unchecked(row); + let entity = entities.get(row); if !F::filter_fetch(&mut filter, *entity, row) { continue; } diff --git a/crates/bevy_ecs/src/storage/aligned_vec.rs b/crates/bevy_ecs/src/storage/aligned_vec.rs new file mode 100644 index 00000000000000..fee0a0d9b14beb --- /dev/null +++ b/crates/bevy_ecs/src/storage/aligned_vec.rs @@ -0,0 +1,258 @@ +use core::alloc::Layout; +use core::borrow::{Borrow, BorrowMut}; +use core::marker::PhantomData; +use core::mem::needs_drop; +use core::ops::{Deref, DerefMut}; + +use core::cmp; +use core::slice::SliceIndex; + +use bevy_ptr::{OwningPtr, ThinSimdAlignedSlicePtr}; + +use super::blob_vec::BlobVec; + +/// A vector whose internal buffer is aligned to `MAX_SIMD_ALIGNMENT`. +/// Intended to support SIMD use cases. +/// +/// Used to densely store homogeneous ECS data whose type is known at compile time. +/// Built on `BlobVec`. It is not intended to be a drop-in replacement for Vec at this time. + +/* +NOTE: AlignedVec is ONLY implemented in terms of BlobVec because the Allocator API is not stable yet. +Once the Allocator API is stable, one could easily define AlignedVec as being a Vec with an allocator +that provides MAX_SIMD_ALIGNMENT as a guarantee, and remove almost all of the code in this file: + + type AlignedVec = Vec; + +As it stands, AlignedVec is a stand-in to provide just enough functionality to work for bevy_ecs. 
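+
+For illustration only, the end state would look roughly like the sketch below. This leans on the
+unstable `allocator_api` feature and a hypothetical `SimdAlignedAlloc` type, so treat it as an
+outline rather than working code:
+
+    #![feature(allocator_api)]
+    use std::alloc::{AllocError, Allocator, Global, Layout};
+    use std::ptr::NonNull;
+
+    struct SimdAlignedAlloc;
+
+    unsafe impl Allocator for SimdAlignedAlloc {
+        fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+            //Round every request up to MAX_SIMD_ALIGNMENT before delegating to the global allocator.
+            Global.allocate(layout.align_to(MAX_SIMD_ALIGNMENT).map_err(|_| AllocError)?)
+        }
+        unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+            //Free with the same padded layout that `allocate` used.
+            Global.deallocate(ptr, layout.align_to(MAX_SIMD_ALIGNMENT).unwrap());
+        }
+    }
+
+    type SimdAlignedVec<T> = Vec<T, SimdAlignedAlloc>;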
+*/ +pub(crate) struct SimdAlignedVec { + vec: BlobVec, + _marker: PhantomData, +} + +impl Default for SimdAlignedVec { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for SimdAlignedVec { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AlignedVec") + .field("vec", &self.vec) + .finish() + } +} + +impl SimdAlignedVec { + // SAFETY: The pointer points to a valid value of type `T` and it is safe to drop this value. + unsafe fn drop_ptr(x: OwningPtr<'_>) { + x.drop_as::(); + } + + pub fn with_capacity(capacity: usize) -> SimdAlignedVec { + Self { + // SAFETY: + // `drop` accurately reflects whether the contents of this Vec need to be dropped, and correctly performs the drop operation. + vec: unsafe { + BlobVec::new( + Layout::new::(), + needs_drop::().then_some(Self::drop_ptr as _), + capacity, + ) + }, + _marker: PhantomData, + } + } + + pub fn new() -> SimdAlignedVec { + Self::with_capacity(0) //Ensure a starting power-of-two capacity (for non-ZSTs) + } + + #[inline] + pub fn len(&self) -> usize { + self.vec.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.vec.len() == 0 + } + + #[inline] + pub fn capacity(&self) -> usize { + self.vec.capacity() + } + + /// # Safety + /// It is the caller's responsibility to ensure that `index` is < self.len() + #[inline] + pub unsafe fn get_unchecked(&self, index: usize) -> &>::Output { + debug_assert!(index < self.len()); + + self.vec.get_unchecked(index).deref() + } + + /// # Safety + /// It is the caller's responsibility to ensure that `index` is < self.len() + #[inline] + pub unsafe fn get_unchecked_mut( + &mut self, + index: usize, + ) -> &mut >::Output { + debug_assert!(index < self.len()); + + self.vec.get_unchecked_mut(index).deref_mut() + } + + //This function attempts to keep the same semantics as Vec's swap_remove function + pub fn swap_remove(&mut self, index: usize) -> T { + #[cold] + #[inline(never)] + fn assert_failed(index: usize, len: usize) -> ! { + panic!("swap_remove index (is {index}) should be < len (is {len})"); + } + let len = self.len(); + if index >= len { + assert_failed(index, len); + } + + // SAFETY: + // The index is guaranteed to be in bounds by this point. + unsafe { self.vec.swap_remove_and_forget_unchecked(index).read() } + } + + pub fn push(&mut self, value: T) { + // SAFETY: + // value is a valid owned instance of T, therefore it is safe to call push with it + OwningPtr::make(value, |ptr| unsafe { + self.vec.push(ptr); + }); + } + + pub fn reserve_exact(&mut self, additional: usize) { + self.vec.reserve_exact(additional); + } + + // From RawVec soruce code, for compatibility + const MIN_NON_ZERO_CAP: usize = if core::mem::size_of::() == 1 { + 8 + } else if core::mem::size_of::() <= 1024 { + 4 + } else { + 1 + }; + + //This function attempts to keep the same semantics as Vec's reserve function + pub fn reserve(&mut self, additional: usize) { + if core::mem::size_of::() == 0 { + // Since we return a capacity of `usize::MAX` when `elem_size` is + // 0, getting to here necessarily means the `AlignedVec` is overfull. + panic!("AlignedVec capacity overflow") + } + + // Nothing we can really do about these checks, sadly. + let required_cap = self.vec.len().checked_add(additional); + + if let Some(cap) = required_cap { + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. 
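+            // Worked example: with len = 10, capacity = 16 and additional = 20, `required_cap` is 30;
+            // doubling the capacity gives 32, so we grow to 32 rather than to exactly 30.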
+ let cap = cmp::max(self.vec.capacity() * 2, cap); + let cap = cmp::max(Self::MIN_NON_ZERO_CAP, cap); + + self.reserve_exact(cap - self.vec.len()); + } else { + panic!("AlignedVec capacity overflow") + } + } + + pub fn clear(&mut self) { + self.vec.clear(); + } + + pub fn get_slice(&self) -> ThinSimdAlignedSlicePtr<'_, T> { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with an shared reference, guaranteeing only other shared references exist. + // The returned ThinSimdAlignedSlicePtr does not permit mutation of the vector, unless T is `UnsafeCell`, + // in which case all standard aliasing rules apply and this is passed onto the user. + // Therefore, the aliasing guarantees are upheld. + unsafe { self.vec.get_slice().as_deref() } + } +} + +impl Borrow<[T]> for SimdAlignedVec { + fn borrow(&self) -> &[T] { + self + } +} + +impl BorrowMut<[T]> for SimdAlignedVec { + fn borrow_mut(&mut self) -> &mut [T] { + self + } +} + +impl AsRef<[T]> for SimdAlignedVec { + fn as_ref(&self) -> &[T] { + self + } +} + +impl AsMut<[T]> for SimdAlignedVec { + fn as_mut(&mut self) -> &mut [T] { + self + } +} + +impl Deref for SimdAlignedVec { + type Target = [T]; + + #[inline] + fn deref(&self) -> &[T] { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with an shared reference, guaranteeing only other shared references exist. + // Therefore, it is safe to provide a shared reference to its contents. + unsafe { + std::slice::from_raw_parts(self.vec.get_ptr().as_ptr() as *const T, self.vec.len()) + } + } +} + +impl DerefMut for SimdAlignedVec { + #[inline] + fn deref_mut(&mut self) -> &mut [T] { + // SAFETY: + // The vector represents an array of T with appropriate alignment. + // The vector is borrowed with a mutable reference, guaranteeing uniqueness. + // Therefore, it is safe to provide a mutable reference to its contents. + unsafe { + core::slice::from_raw_parts_mut( + self.vec.get_ptr_mut().as_ptr() as *mut T, + self.vec.len(), + ) + } + } +} + +impl<'a, T> IntoIterator for &'a mut SimdAlignedVec { + type Item = <&'a mut [T] as IntoIterator>::Item; + + type IntoIter = <&'a mut [T] as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.as_mut().iter_mut() + } +} + +impl<'a, T> IntoIterator for &'a SimdAlignedVec { + type Item = <&'a [T] as IntoIterator>::Item; + + type IntoIter = <&'a [T] as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.as_ref().iter() + } +} diff --git a/crates/bevy_ecs/src/storage/blob_vec.rs b/crates/bevy_ecs/src/storage/blob_vec.rs index 384c26a07dc7af..2d610cc474c2a1 100644 --- a/crates/bevy_ecs/src/storage/blob_vec.rs +++ b/crates/bevy_ecs/src/storage/blob_vec.rs @@ -5,7 +5,9 @@ use std::{ ptr::NonNull, }; -use bevy_ptr::{OwningPtr, Ptr, PtrMut}; +use bevy_ptr::batch::MAX_SIMD_ALIGNMENT; + +use crate::ptr::{batch, OwningPtr, Ptr, PtrMut, ThinSimdAlignedSlicePtr}; /// A flat, type-erased data storage type /// @@ -49,10 +51,23 @@ impl BlobVec { drop: Option)>, capacity: usize, ) -> BlobVec { + /*NOTE: Dangling pointers still need to be well aligned for the type when using slices (even though they are 0-length). + This is important for [`SimdAlignedVec`] and any function that would return a slice view of this BlobVec. + + Since neither strict_provenance nor alloc_layout_extra is stable, there is no way to construct a NonNull::dangling() + pointer from `item_layout` without using a pointer cast. 
This requires `-Zmiri-permissive-provenance` when testing, + otherwise Miri will issue a warning. + + TODO: Rewrite this when strict_provenance or alloc_layout_extra is stable. + */ + + let dangling = + NonNull::new(item_layout.align().max(MAX_SIMD_ALIGNMENT) as *mut u8).unwrap(); + if item_layout.size() == 0 { BlobVec { - swap_scratch: NonNull::dangling(), - data: NonNull::dangling(), + swap_scratch: dangling, + data: dangling, capacity: usize::MAX, len: 0, item_layout, @@ -63,7 +78,7 @@ impl BlobVec { .unwrap_or_else(|| std::alloc::handle_alloc_error(item_layout)); let mut blob_vec = BlobVec { swap_scratch, - data: NonNull::dangling(), + data: dangling, capacity: 0, len: 0, item_layout, @@ -300,13 +315,13 @@ impl BlobVec { unsafe { PtrMut::new(self.data) } } - /// Get a reference to the entire [`BlobVec`] as if it were an array with elements of type `T` + /// Get a reference to the entire [`BlobVec`] as if it were an array with elements of type `T`. /// /// # Safety /// The type `T` must be the type of the items in this [`BlobVec`]. - pub unsafe fn get_slice(&self) -> &[UnsafeCell] { + pub unsafe fn get_slice(&self) -> ThinSimdAlignedSlicePtr> { // SAFETY: the inner data will remain valid for as long as 'self. - std::slice::from_raw_parts(self.data.as_ptr() as *const UnsafeCell, self.len) + ThinSimdAlignedSlicePtr::new(self.data.as_ptr() as *mut UnsafeCell, self.len) } pub fn clear(&mut self) { @@ -353,6 +368,9 @@ impl Drop for BlobVec { fn array_layout(layout: &Layout, n: usize) -> Option { let (array_layout, offset) = repeat_layout(layout, n)?; debug_assert_eq!(layout.size(), offset); + + //Note: NEEDED for batching. This is the layout of the array itself, not the layout of its elements. + let array_layout = array_layout.align_to(batch::MAX_SIMD_ALIGNMENT).unwrap(); Some(array_layout) } diff --git a/crates/bevy_ecs/src/storage/mod.rs b/crates/bevy_ecs/src/storage/mod.rs index 6e848a042b492c..d894197f8e7ee7 100644 --- a/crates/bevy_ecs/src/storage/mod.rs +++ b/crates/bevy_ecs/src/storage/mod.rs @@ -1,5 +1,6 @@ //! Storage layouts for ECS data. +pub(super) mod aligned_vec; mod blob_vec; mod resource; mod sparse_set; diff --git a/crates/bevy_ecs/src/storage/sparse_set.rs b/crates/bevy_ecs/src/storage/sparse_set.rs index fdb9a21176a5ff..3988180dc39701 100644 --- a/crates/bevy_ecs/src/storage/sparse_set.rs +++ b/crates/bevy_ecs/src/storage/sparse_set.rs @@ -1,6 +1,7 @@ use crate::{ component::{ComponentId, ComponentInfo, ComponentTicks}, entity::Entity, + storage::aligned_vec::SimdAlignedVec, storage::Column, }; use bevy_ptr::{OwningPtr, Ptr}; @@ -244,8 +245,8 @@ impl ComponentSparseSet { /// `I` is the type of the indices, while `V` is the type of data stored in the dense storage. 
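+/// The dense storage is backed by the crate's `SimdAlignedVec`, so its buffer is aligned to
+/// `MAX_SIMD_ALIGNMENT`.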
#[derive(Debug)] pub struct SparseSet { - dense: Vec, - indices: Vec, + dense: SimdAlignedVec, + indices: SimdAlignedVec, sparse: SparseArray, } @@ -255,10 +256,10 @@ impl Default for SparseSet { } } impl SparseSet { - pub const fn new() -> Self { + pub fn new() -> Self { Self { - dense: Vec::new(), - indices: Vec::new(), + dense: SimdAlignedVec::new(), + indices: SimdAlignedVec::new(), sparse: SparseArray::new(), } } @@ -267,8 +268,8 @@ impl SparseSet { impl SparseSet { pub fn with_capacity(capacity: usize) -> Self { Self { - dense: Vec::with_capacity(capacity), - indices: Vec::with_capacity(capacity), + dense: SimdAlignedVec::with_capacity(capacity), + indices: SimdAlignedVec::with_capacity(capacity), sparse: Default::default(), } } diff --git a/crates/bevy_ecs/src/storage/table.rs b/crates/bevy_ecs/src/storage/table.rs index adb684469a6d6c..57a1004aff5059 100644 --- a/crates/bevy_ecs/src/storage/table.rs +++ b/crates/bevy_ecs/src/storage/table.rs @@ -2,9 +2,9 @@ use crate::{ component::{ComponentId, ComponentInfo, ComponentTicks, Components}, entity::Entity, query::DebugCheckedUnwrap, - storage::{blob_vec::BlobVec, SparseSet}, + storage::{aligned_vec::SimdAlignedVec, blob_vec::BlobVec, SparseSet}, }; -use bevy_ptr::{OwningPtr, Ptr, PtrMut}; +use bevy_ptr::{OwningPtr, Ptr, PtrMut, ThinSimdAlignedSlicePtr}; use bevy_utils::HashMap; use std::alloc::Layout; use std::{ @@ -35,7 +35,7 @@ impl TableId { #[derive(Debug)] pub struct Column { data: BlobVec, - ticks: Vec>, + ticks: SimdAlignedVec>, } impl Column { @@ -44,7 +44,7 @@ impl Column { Column { // SAFETY: component_info.drop() is valid for the types that will be inserted. data: unsafe { BlobVec::new(component_info.layout(), component_info.drop(), capacity) }, - ticks: Vec::with_capacity(capacity), + ticks: SimdAlignedVec::with_capacity(capacity), } } @@ -187,13 +187,13 @@ impl Column { /// # Safety /// The type `T` must be the type of the items in this column. - pub unsafe fn get_data_slice(&self) -> &[UnsafeCell] { + pub unsafe fn get_data_slice(&self) -> ThinSimdAlignedSlicePtr> { self.data.get_slice() } #[inline] - pub fn get_ticks_slice(&self) -> &[UnsafeCell] { - &self.ticks + pub fn get_ticks_slice(&self) -> ThinSimdAlignedSlicePtr> { + self.ticks.get_slice() } #[inline] @@ -264,20 +264,20 @@ impl Column { pub struct Table { columns: SparseSet, - entities: Vec, + entities: SimdAlignedVec, } impl Table { pub(crate) fn with_capacity(capacity: usize, column_capacity: usize) -> Table { Self { columns: SparseSet::with_capacity(column_capacity), - entities: Vec::with_capacity(capacity), + entities: SimdAlignedVec::with_capacity(capacity), } } #[inline] - pub fn entities(&self) -> &[Entity] { - &self.entities + pub fn entities(&self) -> ThinSimdAlignedSlicePtr<'_, Entity> { + self.entities.get_slice() } pub(crate) fn add_column(&mut self, component_info: &ComponentInfo) { @@ -462,6 +462,20 @@ impl Table { self.columns.capacity() } + #[inline] + pub fn batchable_region_end(&self) -> usize { + //Critical invariant: each Component storage is aligned to MAX_SIMD_ALIGNMENT + //and each component in the query can be batched e.g., for (Q1, Q2), both Q1 and Q2 issue aligned batches + //Therefore, for the given query, the batch size of N is valid. 
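+        //Worked example: a table of 127 entities with N = 4 splits as 127 = 31*4 + 3, so the
+        //batchable region is [0, 124) and rows 124..127 take the scalar path (the
+        //`batched_queries` test asserts exactly this split).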
+ + //The components are divided into [batchable region][scalar region] + //Given the above invariants, the euclidian division of table.len() = bN + s, 0 <= s < N, gives + //b batches with a scalar region of s + //Therefore, the batch region of indices is from [0, bN), and the scalar is [bN, table.len()) + + (self.entity_count() / N) * N + } + #[inline] pub fn is_empty(&self) -> bool { self.entities.is_empty() diff --git a/crates/bevy_ecs/src/system/query.rs b/crates/bevy_ecs/src/system/query.rs index 8f7e4f70bc3035..88c11c89117e67 100644 --- a/crates/bevy_ecs/src/system/query.rs +++ b/crates/bevy_ecs/src/system/query.rs @@ -1,9 +1,11 @@ use crate::{ component::Component, entity::Entity, + ptr::elain::{Align, Alignment}, query::{ - QueryCombinationIter, QueryEntityError, QueryIter, QueryManyIter, QuerySingleError, - QueryState, ROQueryItem, ReadOnlyWorldQuery, WorldQuery, + QueryBatch, QueryCombinationIter, QueryEntityError, QueryItem, QueryIter, QueryManyIter, + QuerySingleError, QueryState, ROQueryBatch, ROQueryItem, ReadOnlyWorldQuery, WorldQuery, + WorldQueryBatch, }, world::{Mut, World}, }; @@ -687,6 +689,29 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> Query<'w, 's, Q, F> { }; } + /// See [`QueryState::for_each_batched`](QueryState::for_each_batched) for how to use this function. + #[inline] + pub fn for_each_batched<'a, const N: usize, const MIN_ALIGN: usize>( + &'a mut self, + func: impl FnMut(ROQueryItem<'a, Q>), + func_batch: impl FnMut(ROQueryBatch<'a, Q, N, MIN_ALIGN>), + ) where + ::ReadOnly: WorldQueryBatch, + Align: Alignment, + { + // SAFETY: system runs without conflicts with other systems. same-system queries have runtime + // borrow checks when they conflict + unsafe { + self.state.as_readonly().for_each_unchecked_manual_batched( + self.world, + func, + func_batch, + self.last_change_tick, + self.change_tick, + ); + }; + } + /// Runs `f` on each query item. /// /// # Example @@ -725,6 +750,231 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> Query<'w, 's, Q, F> { }; } + /// This is a "batched" version of [`for_each_mut`](Self::for_each_mut) that accepts a batch size `N` together with a desired alignment for the batches `ALIGN`. + /// The advantage of using batching in queries is that it enables SIMD acceleration (vectorization) of your code to help you meet your performance goals. + /// This function accepts two arguments, `func`, and `func_batch` which represent the "scalar" and "vector" (or "batched") paths of your code respectively. + /// Each "batch" contains `N` query results, in order, with a guaranteed alignment for the batch to aid in vectorization of the query. + /// + /// # A very brief introduction to SIMD + /// + /// SIMD, or Single Instruction, Multiple Data, is a paradigm that allows a single instruction to operate on multiple datums in parallel. + /// It is most commonly seen in "vector" instruction set extensions such as AVX and NEON, where it is possible to, for example, add + /// two arrays of `[f32; 4]` together in a single instruction. When used appropriately, SIMD is a very powerful tool that can greatly accelerate certain types of workloads. + /// An introductory treatment of SIMD can be found [on Wikipedia](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) for interested readers. + /// + /// [Vectorization](https://stackoverflow.com/questions/1422149/what-is-vectorization) is an informal term to describe optimizing code to leverage these SIMD instruction sets. + /// + /// # Just what is this alignment thing, anyway? 
+ /// + /// [This chapter](https://doc.rust-lang.org/reference/type-layout.html) of the Rust reference is a great treatment on alignment. + /// + /// Vector instructions often come with memory operand alignment restrictions that make their usage more complicated, and these + /// are typically based on the vector size in bytes. For example, a `[f32; 4]` vector is 16 bytes long, and in SSE4 some instructions + /// taking a `[f32; 4]` memory operand require it to be 16 bytes aligned. "Unaligned moves" are possible, but may carry a performance + /// penalty under certain circumstances. If the compiler can't prove that a memory operand will be aligned appropriately, it must assume + /// the worst case and emit code expecting unaligned data. + /// + /// Fortunately, you can provide a guaranteed batch alignment using the `ALIGN` parameter. This ensures that your batch + /// alignment is at least `ALIGN` (a compile-time assertion ensures if this is not possible, the build cannot succeed). + /// The result is that the Rust compiler can directly see that your reads and writes are aligned appropriately + /// and enable usage of vector instructions with aligned memory operands. That is, Bevy proves that memory operands will be aligned to at least `ALIGN`, + /// enabling greater optimization potential, and also helping you if you choose to write SIMD intrinsics directly. + /// + /// When generalized const expressions are stable, the guaranteed alignment of your batch will be calculated for you automatically + /// and you won't need to pass in the `ALIGN` parameter. + /// + /// # When should I consider batching for my query? + /// + /// The first thing you should consider is if you are meeting your performance goals. Batching a query is fundamentally an optimization, and if your application is meeting performance requirements + /// already, then (other than for your own entertainment) you won't get much benefit out of batching. If you are having performance problems though, the next step is to + /// use a [profiler](https://nnethercote.github.io/perf-book/profiling.html) to determine the running characteristics of your code. + /// If, after profiling your code, you have determined that a substantial amount of time is being processing a query, and it's hindering your performance goals, + /// then it might be worth it to consider batching to meet them. + /// + /// One of the main tradeoffs with batching your queries is that there will be an increased complexity from maintaining both code paths: `func` and `func_batch` + /// semantically should be doing the same thing, and it should always be possible to interchange them without visible program effects. + /// + /// # What kinds of queries make sense to batch? + /// + /// Usually math related ones: an example is given below showing how to accelerate a simple `position += velocity * time` calculation using batching. + /// Anything involving floats is a possible candidate. Depending on your component layout, you may need to perform a data layout conversion + /// to batch the query optimally. This Wikipedia page on ["array of struct" and "struct of array" layouts](https://en.wikipedia.org/wiki/AoS_and_SoA) is a good starter on + /// this topic, as is this [Intel blog post](https://www.intel.com/content/www/us/en/developer/articles/technical/memory-layout-transformations.html). + /// The example below uses data layout conversion. + /// + /// Vectorizing code can be a very deep subject to get into. 
+ /// Sometimes it can be very straightfoward to accomplish what you want to do, and other times it takes a bit of playing around to make your problem fit the SIMD model. + /// + /// # Will batching always make my queries faster? + /// + /// Unfortunately it will not. A suboptimally written batched query will probably perform worse than a straightforward `for_each_mut` query. Data layout conversion, + /// for example, carries overhead that may not always be worth it. Fortunately, your profiler can help you identify these situations. + /// + /// Think of batching as a tool in your performance toolbox rather than the preferred way of writing your queries. + /// + /// # What kinds of queries are batched right now? + /// + /// Currently, only "Dense" queries are actually batched; other queries will only use `func` and never call `func_batch`. This will improve + /// in the future. + /// + /// # Usage: + /// + /// * Supported values of `ALIGN` are 16, 32, and 64 + /// * `N` must be a power of 2 + /// * `func_batch` receives components in "batches" that are aligned to `ALIGN`. + /// * `func` functions exactly as does in [`for_each_mut`](Self::for_each_mut) -- it receives "scalar" (non-batched) components. + /// + /// In other words, `func_batch` composes the "fast path" of your query, and `func` is the "slow path". + /// + /// Batches are currently one of [`AlignedBatch16`](bevy_ptr::batch::AlignedBatch16), [`AlignedBatch32`](bevy_ptr::batch::AlignedBatch32), + /// or [`AlignedBatch64`](bevy_ptr::batch::AlignedBatch64) types, each corresponding to a guaranteed batch alignment. + /// The batch alignment is important as it enables architecture-specific optimizations that depend on alignment. + /// + /// If you attempt to call this function with an invalid generic argument, a compile-time error will be issued. For example, + /// if you request 64-bytes alignment on a batch whose size is 8 bytes, your program will fail to compile. In general, + /// choose `ALIGN` as large as you can for your batch parameters. The minimum alignment supported for batches is 16 bytes. + /// + /// **Note**: Once generalized const expressions are stable, it will be possible to compute `ALIGN` automatically based on the component type and batch size. + /// Compile-time assertions will still exist for sanity checking. + /// + /// In general, when using this function, be mindful of the types of filters being used with your query, as these can fragment your batches + /// and cause the scalar path to be taken more often. + /// + /// **Note**: It is always valid for the implementation of this function to only call `func`. Currently, batching is only supported for "Dense" queries. + /// Calling this function on any other query type will result in only the slow path being executed (e.g., queries with Sparse components.) + /// More query types may become batchable in the future. + /// + /// **Note**: Although this function provides the groundwork for writing performance-portable SIMD code, you will still need to take into account + /// your target architecture's capabilities. The batch size will likely need to be tuned for your application, for example. + /// When SIMD becomes stabilized in Rust, it will be possible to write code that is generic over the batch width, but some degree of tuning will likely always be + /// necessary. Think of this as a tool at your disposal to meet your performance goals. 
+ ///
+ /// # Example: Accelerate a simple "`Position += Velocity * time`" calculation using SIMD
+ ///
+ /// `Position` and `Velocity` are represented using `Vec3`s. A batch width of 4 is chosen to match SSE4 and allow the use of the `Vec4` type to perform SIMD. This example uses the [`as_inner`](bevy_ptr::batch::AlignedBatch16::as_inner)
+ /// functions to easily and efficiently process components with a single member.
+ ///
+ /// ```rust
+ /// use bevy_ecs::prelude::*;
+ /// use bevy_ptr::batch::AlignedBatch;
+ /// use bevy_ptr::bytemuck::TransparentWrapper;
+ /// use bevy_math::{Vec3, Vec4};
+ ///
+ /// #[derive(Clone, Copy, Component, PartialEq, Debug)]
+ /// //We want repr(transparent) here to access the `as_inner` AlignedBatch functions.
+ /// #[repr(transparent)]
+ /// struct Position(Vec3);
+ ///
+ /// //Important: this allows us to "cast away" the outer layer of Position
+ /// // SAFETY: Position is repr(transparent) and contains a Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Position {}
+ ///
+ /// #[derive(Clone, Copy, Component, PartialEq, Debug)]
+ /// //We want repr(transparent) here to access the `as_inner` AlignedBatch functions.
+ /// #[repr(transparent)]
+ /// struct Velocity(Vec3);
+ ///
+ /// //Important: this allows us to "cast away" the outer layer of Velocity
+ /// // SAFETY: Velocity is repr(transparent) and contains a Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Velocity {}
+ ///
+ /// //Convert the AoS representation to an SoA representation amenable to SIMD operations.
+ /// //This uses the `AsRef` trait to generically deal with different alignments
+ /// //on size-4 batches.
+ /// fn aos_to_soa(aos: &impl AsRef<[Vec3; 4]>) -> [Vec4; 3]
+ /// {
+ /// let [p0, p1, p2, p3] = aos.as_ref();
+ ///
+ /// let xs = Vec4::new(p0.x, p1.x, p2.x, p3.x);
+ /// let ys = Vec4::new(p0.y, p1.y, p2.y, p3.y);
+ /// let zs = Vec4::new(p0.z, p1.z, p2.z, p3.z);
+ ///
+ /// [xs, ys, zs]
+ /// }
+ ///
+ /// //Convert the SoA representation back to AoS for storing back into the ECS.
+ /// fn soa_to_aos(soa: &[Vec4; 3]) -> [Vec3; 4]
+ /// {
+ /// let [xs, ys, zs] = soa;
+ ///
+ /// let p0 = Vec3::new(xs.x, ys.x, zs.x);
+ /// let p1 = Vec3::new(xs.y, ys.y, zs.y);
+ /// let p2 = Vec3::new(xs.z, ys.z, zs.z);
+ /// let p3 = Vec3::new(xs.w, ys.w, zs.w);
+ ///
+ /// [p0, p1, p2, p3]
+ /// }
+ ///
+ /// const DELTA_TIMESTEP: f32 = 1.0/60.0;
+ ///
+ /// fn position_update_system(mut query: Query<(&mut Position, &Velocity)>)
+ /// {
+ /// //Execute this query in batches of 4 components with an alignment of 16.
+ /// //Note that if this is not possible, the program will fail to compile.
+ /// //Try changing "16" to "32" here, for example, and observe what happens.
+ /// //In the future, the alignment will be computed for you automatically when
+ /// //generic const expressions are stable.
+ /// query.for_each_mut_batched::<4, 16>(|(mut position, velocity)|
+ /// {
+ /// //The scalar path -- this is only executed for the epilogue or when
+ /// //filters are present that would fragment the query
+ /// position.0 += DELTA_TIMESTEP * velocity.0;
+ /// },
+ /// |(mut position, velocity)|
+ /// {
+ /// //The batched path -- `position` and `velocity` are now AlignedBatch types.
+ /// //This uses SIMD to perform the calculation.
+ /// //Note that this example is designed around SSE4.
+ /// //For AVX2, for example, you would want batches of 8 `f32`s.
+ /// //When SIMD is stabilized in Rust, it could be replaced with generic vector-width code.
+ ///
+ /// //NOTE: change trackers currently will add overhead.
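+ /// //(writing through a batch marks every element of the batch as changed)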
+ /// //If this is a problem, uncomment the line below:
+ /// //let position = position.bypass_change_detection();
+ ///
+ /// //This lets us treat our batch of Position as a batch of Vec3
+ /// let mut ps = position.as_inner_mut::<Vec3>();
+ ///
+ /// // Turn [Vec3; 4] into [Vec4; 3] form for SIMD acceleration
+ /// let [ps_x, ps_y, ps_z] = aos_to_soa(ps);
+ /// let [vs_x, vs_y, vs_z] = aos_to_soa(velocity.as_inner::<Vec3>());
+ /// //NOTE: the above can also be achieved using velocity.map(|c| c.0)
+ ///
+ /// //Now that we have our [Vec4; 3], this line will optimize well using SIMD
+ /// let newps = [ps_x + vs_x * DELTA_TIMESTEP,
+ /// ps_y + vs_y * DELTA_TIMESTEP,
+ /// ps_z + vs_z * DELTA_TIMESTEP];
+ ///
+ /// //But our intermediate work is still in SoA representation!
+ /// //We still need to translate it back.
+ /// //"into" is used to convert the [Vec3; 4] into an AlignedBatch16.
+ /// //It *should* optimize right out.
+ /// *ps = soa_to_aos(&newps).into();
+ /// });
+ /// }
+ /// # bevy_ecs::system::assert_is_system(position_update_system);
+ /// ```
+ #[inline]
+ pub fn for_each_mut_batched<'a, const N: usize, const MIN_ALIGN: usize>(
+ &'a mut self,
+ func: impl FnMut(QueryItem<'a, Q>),
+ func_batch: impl FnMut(QueryBatch<'a, Q, N, MIN_ALIGN>),
+ ) where
+ Q: WorldQueryBatch,
+ Align<MIN_ALIGN>: Alignment,
+ {
+ // SAFETY: system runs without conflicts with other systems. same-system queries have runtime
+ // borrow checks when they conflict
+ unsafe {
+ self.state.for_each_unchecked_manual_batched(
+ self.world,
+ func,
+ func_batch,
+ self.last_change_tick,
+ self.change_tick,
+ );
+ };
+ }
+
 /// Runs `f` on each read-only query item in parallel.
 ///
 /// Parallelization is achieved by using the [`World`]'s [`ComputeTaskPool`].
diff --git a/crates/bevy_ecs/src/world/world_cell.rs b/crates/bevy_ecs/src/world/world_cell.rs
index 7679e51c407303..6c292ef29e3ed2 100644
--- a/crates/bevy_ecs/src/world/world_cell.rs
+++ b/crates/bevy_ecs/src/world/world_cell.rs
@@ -33,7 +33,7 @@ impl Default for ArchetypeComponentAccess {
 const UNIQUE_ACCESS: usize = 0;
 const BASE_ACCESS: usize = 1;
 impl ArchetypeComponentAccess {
- const fn new() -> Self {
+ fn new() -> Self {
 Self {
 access: SparseSet::new(),
 }
 }
diff --git a/crates/bevy_ptr/Cargo.toml b/crates/bevy_ptr/Cargo.toml
index 04b7e30fc9b5b4..671b8165c44abc 100644
--- a/crates/bevy_ptr/Cargo.toml
+++ b/crates/bevy_ptr/Cargo.toml
@@ -9,3 +9,6 @@ license = "MIT OR Apache-2.0"
 keywords = ["bevy", "no_std"]
 
 [dependencies]
+elain = "0.3"
+bytemuck = "1.12"
+bevy_math = { path = "../bevy_math", version = "0.9.0-dev" }
\ No newline at end of file
diff --git a/crates/bevy_ptr/src/batch.rs b/crates/bevy_ptr/src/batch.rs
new file mode 100644
index 00000000000000..c26ffe7cd07e0b
--- /dev/null
+++ b/crates/bevy_ptr/src/batch.rs
@@ -0,0 +1,360 @@
+use crate::ThinSimdAlignedSlicePtr;
+
+use core::{
+ cell::UnsafeCell,
+ marker::PhantomData,
+ ops::{Index, IndexMut},
+};
+
+use crate::bytemuck;
+
+use elain::{Align, Alignment};
+
+use bevy_math::Vec4;
+
+/*
+NOTE: We define this constant here as both [`bevy_ptr`] and [`bevy_ecs`] need to know about it.
+
+If this is a problem, this can be replaced with code that looks something like the following:
+
+ #[cfg(all(any(target_feature = "avx"), not(target_feature = "avx512f")))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 32;
+
+ #[cfg(any(target_feature = "avx512f"))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 64;
+
+ //All platforms get 16-byte alignment on tables guaranteed.
+ #[cfg(not(any(target_feature = "avx512f")))]
+ pub const MAX_SIMD_ALIGNMENT: usize = 16;
+*/
+/// The maximum SIMD alignment for a given target.
+/// `MAX_SIMD_ALIGNMENT` is 64 for the following reasons:
+/// 1. It ensures that table columns are aligned to cache lines on x86.
+/// 2. 64 is the maximum alignment required to use all instructions on all known CPU architectures.
+/// This greatly simplifies handling cross-platform alignment on a case-by-case basis: by aligning to the worst case, we align for all cases.
+/// 3. The overhead of aligning columns to 64 bytes is very small, as columns will in general be much larger than this.
pub const MAX_SIMD_ALIGNMENT: usize = 64;
+
+//TODO: AoSoA representations
+
+//TODO: when possible, compute alignments automatically using the GCD (requires generic const expressions) and ensure the
+//batch lambda accepts arguments of AlignedBatchTrait directly. This will let the alignment be automatically computed
+//and allow different query elements to have different alignments.
+
+/// A batch of `N` components of type `T`, aligned to at least `MIN_ALIGN` bytes.
+/// Generic code can work with batches of varying sizes and alignments through the methods on this type
+/// and its `AsRef`, `AsMut`, and `From` implementations (and, in the future, through Rust SIMD once it is stabilized).
+// FIXME: what if align(T) > MIN_ALIGN?
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct AlignedBatch<T, const N: usize, const MIN_ALIGN: usize>
+where
+ Align<MIN_ALIGN>: Alignment,
+{
+ _align: Align<MIN_ALIGN>,
+ batch: [T; N],
+}
+
+impl<T, const N: usize, const MIN_ALIGN: usize> AlignedBatch<T, N, MIN_ALIGN>
+where
+ Align<MIN_ALIGN>: Alignment,
+{
+ //These only make sense when a component is #[repr(transparent)].
+ //For example, if you had a repr(transparent) Position component that contained a Vec3, it
+ //would be semantically valid to get a reference to the inner component.
+ //The TransparentWrapper unsafe trait from the bytemuck crate is used to make this process usable in user code.
+
+ /// If `T` is `repr(transparent)`, then `as_inner` can be used to get a shared reference to an "inner view" of the batch.
+ /// To use this, implement [`bytemuck::TransparentWrapper`] for your component.
+ /// For example, if you had a `repr(transparent)` `Position` component that contained a [`bevy_math::Vec3`], you could treat a batch of `Position` as a batch of [`bevy_math::Vec3`].
+ #[inline]
+ pub fn as_inner<Inner>(&self) -> &AlignedBatch<Inner, N, MIN_ALIGN>
+ where
+ T: bytemuck::TransparentWrapper<Inner>,
+ {
+ // SAFETY:
+ //
+ // * T is repr(transparent), with inner type Inner
+ // * AlignedBatch<T, N, MIN_ALIGN> and AlignedBatch<Inner, N, MIN_ALIGN> are both repr(C)
+ // * Therefore AlignedBatch<T, N, MIN_ALIGN> and AlignedBatch<Inner, N, MIN_ALIGN> have the same layout
+ // * Since self is a shared reference, creating more shared references to the same memory is OK.
+ unsafe { &*(self as *const Self as *const AlignedBatch<Inner, N, MIN_ALIGN>) }
+ }
+
+ /// If `T` is `repr(transparent)`, then `as_inner_mut` can be used to get a mutable reference to an "inner view" of the batch.
+ /// To use this, implement [`bytemuck::TransparentWrapper`] for your component.
+ /// For example, if you had a `repr(transparent)` `Position` component that contained a [`bevy_math::Vec3`], you could treat a batch of `Position` as a batch of [`bevy_math::Vec3`].
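+ ///
+ /// A rough sketch of the intended pattern (the `Position` wrapper here is illustrative and not part of this crate):
+ ///
+ /// ```ignore
+ /// use bevy_math::Vec3;
+ /// use bevy_ptr::batch::AlignedBatch;
+ /// use bevy_ptr::bytemuck::TransparentWrapper;
+ ///
+ /// #[derive(Clone, Copy)]
+ /// #[repr(transparent)]
+ /// struct Position(Vec3);
+ ///
+ /// // SAFETY: Position is repr(transparent) over Vec3
+ /// unsafe impl TransparentWrapper<Vec3> for Position {}
+ ///
+ /// let mut batch: AlignedBatch<Position, 4, 16> = [Position(Vec3::ZERO); 4].into();
+ /// // View (and mutate) the same memory as a batch of Vec3
+ /// batch.as_inner_mut::<Vec3>().as_array_mut()[0].x = 1.0;
+ /// assert_eq!(batch.as_array()[0].0.x, 1.0);
+ /// ```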
+ #[inline] + pub fn as_inner_mut(&mut self) -> &mut AlignedBatch + where + T: bytemuck::TransparentWrapper, + { + // SAFETY: + // + // * Recommended pattern from the Rust book: https://doc.rust-lang.org/std/mem/fn.transmute.html + // * Section: "turning an &mut T into an &mut U" + // * T is repr(transparent), with inner type Inner + // * $batch_type is repr(transparent) + // * $batch_type is repr(transparent) + // * Therefore $batch_type and $batch_type + unsafe { &mut *(self as *mut Self as *mut AlignedBatch) } + } + + /// Constructs a new batch with the result of `func` mapped over the components of this batch. + //TODO: this doesn't optimize very well... + #[inline] + pub fn map U>(self, func: F) -> AlignedBatch { + AlignedBatch:: { + _align: self._align, + batch: self.batch.map(func), + } + } + + /// Retrieve a shared reference to this batch as an array of `[T; N]` + /// You can use this to destructure your batch into elements if needed. + #[inline] + pub fn as_array(&self) -> &[T; N] { + self.as_ref() + } + + /// Retrieve a mutable reference to this batch as an array of `[T; N]`. + /// You can use this to modify elements of your batch. + #[inline] + pub fn as_array_mut(&mut self) -> &mut [T; N] { + self.as_mut() + } + + /// Convert this batch into an array of `[T; N]`. + /// A convenience function, as all batches implement [`From`] and [`Into`] for `[T; N]`. + #[inline] + pub fn into_array(self) -> [T; N] { + self.into() + } + + //TODO: add support for as_simd()/into_simd() when SIMD is stabilized! +} + +impl From<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + #[inline] + fn from(batch: [T; N]) -> Self { + Self { + _align: Align::NEW, + batch, + } + } +} + +impl From> for [T; N] +where + Align: Alignment, +{ + #[inline] + fn from(v: AlignedBatch) -> Self { + v.batch + } +} + +impl Index for AlignedBatch +where + Align: Alignment, +{ + type Output = T; + + #[inline] + fn index(&self, i: usize) -> &>::Output { + &self.batch[i] + } +} + +impl IndexMut for AlignedBatch +where + Align: Alignment, +{ + #[inline] + fn index_mut(&mut self, i: usize) -> &mut >::Output { + &mut self.batch[i] + } +} + +impl AsRef<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + fn as_ref(&self) -> &[T; N] { + &self.batch + } +} + +impl AsMut<[T; N]> for AlignedBatch +where + Align: Alignment, +{ + fn as_mut(&mut self) -> &mut [T; N] { + &mut self.batch + } +} + +// TODO: when stable, replace `ALIGN` with an `Alignment` enum +// OR when general const expressions are stable, replace with a trait constraint `Alignment`. +// Do the same for batch sizes. For now, this is the best we can do. + +//Convenience impls that can go away when SIMD is stabilized +impl AsRef for AlignedBatch { + #[inline] + fn as_ref(&self) -> &Vec4 { + // SAFETY: + // * Alignment of Vec4 is 16 + // * Alignment of Self is 16 + // * Self is repr(C) and therefore can be treated as an [f32; 4] + // * Vec4 is repr(transparent) and can be treated as an [f32; 4] (it is an __mm128) + // * Only shared refs exist + // Therefore this cast is sound. 
+ unsafe { &*(self as *const Self as *const Vec4) } + } +} + +impl AsMut for AlignedBatch { + #[inline] + fn as_mut(&mut self) -> &mut Vec4 { + // SAFETY: + // * Alignment of Vec4 is 16 + // * Alignment of Self is 16 + // * Self is repr(C) and therefore can be treated as an [f32; 4] + // * Vec4 is repr(transparent) and can be treated as an [f32; 4] (it is an __mm128) + // * &mut T to &mut U pattern used from the Rust book to ensure soundness when casting mutable refs + // Therefore this cast is sound. + unsafe { &mut *(self as *mut Self as *mut Vec4) } + } +} + +impl<'a, T> ThinSimdAlignedSlicePtr<'a, T> { + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The batch size in bytes must be a multiple of `ALIGN`. + /// A compile-time error will be given if the alignment requirements cannot be met with the given parameters. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + #[inline] + unsafe fn get_batch_aligned_raw( + self, + index: usize, + _len: usize, + ) -> *const AlignedBatch + where + Align: Alignment, + { + //Can't use this method if the batch doesn't make sense (performance safety). + //STATIC ASSERTIONS. Rust has no support for these directly... + #[allow(clippy::let_unit_value)] + let _ = Assert::::YOUR_BATCH_SIZE_IS_NOT_A_MULTIPLE_OF_ALIGN; + #[allow(clippy::let_unit_value)] + let _ = Assert::::MIN_ALIGN_IS_NOT_A_POWER_OF_TWO; //FIXME: no longer needed + #[allow(clippy::let_unit_value)] + let _ = Assert::::BATCH_SIZE_IS_NOT_A_POWER_OF_TWO; //FIXME: not actually required + #[allow(clippy::let_unit_value)] + let _ = Assert::::MIN_ALIGN_IS_LESS_THAN_ALIGN_OF_T; //FIXME: no longer needed + + #[cfg(debug_assertions)] + debug_assert!(index + N < self.len); + #[cfg(debug_assertions)] + debug_assert_eq!(_len, self.len); + #[cfg(debug_assertions)] + debug_assert_eq!(index % N, 0); + + let off_ptr = self.ptr.as_ptr().add(index); + + //NOTE: ZSTs may cause this "slice" to point into nothingness. + //This sounds dangerous, but won't cause harm as nothing + //will actually access anything "in the slice" + + //TODO: when pointer_is_aligned is standardized, we can just use ptr::is_aligned() + #[cfg(debug_assertions)] + debug_assert_eq!(off_ptr as usize % MIN_ALIGN, 0); + + //SAFETY: off_ptr is not null + off_ptr as *const AlignedBatch + } + + /// Indexes the slice without doing bounds checks with a batch size of N. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be suitably aligned. + #[inline] + pub unsafe fn get_batch_aligned( + self, + index: usize, + len: usize, + ) -> &'a AlignedBatch + where + Align: Alignment, + { + &(*self.get_batch_aligned_raw(index, len)) + } +} + +impl<'a, T> ThinSimdAlignedSlicePtr<'a, UnsafeCell> { + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The semantics are like `UnsafeCell` -- you must ensure the aliasing constraints are met. + /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + /// No other references exist to the batch of size `N` at `index` + #[inline] + pub unsafe fn get_batch_aligned_deref_mut( + self, + index: usize, + len: usize, + ) -> &'a mut AlignedBatch + where + Align: Alignment, + { + &mut *(self + .as_deref() + .get_batch_aligned_raw::(index, len) + as *mut AlignedBatch) + } + + /// Indexes the slice without doing bounds checks with a batch size of `N`. + /// The semantics are like `UnsafeCell` -- you must ensure the aliasing constraints are met. 
+ /// + /// # Safety + /// `index` must be in-bounds. + /// `index` must be a multiple of `N`. + /// No mutable references exist to the batch of size `N` at `index` + #[inline] + pub unsafe fn get_batch_aligned_deref( + self, + index: usize, + len: usize, + ) -> &'a AlignedBatch + where + Align: Alignment, + { + &*(self + .as_deref() + .get_batch_aligned_raw::(index, len)) + } +} + +//Inspired from: https://github.com/rust-lang/rust/issues/57775#issuecomment-1098001375 +struct Assert { + _marker: PhantomData, +} + +impl Assert { + const YOUR_BATCH_SIZE_IS_NOT_A_MULTIPLE_OF_ALIGN: () = + assert!((N * core::mem::size_of::()) % MIN_ALIGN == 0); + const MIN_ALIGN_IS_NOT_A_POWER_OF_TWO: () = assert!(MIN_ALIGN.is_power_of_two()); + const BATCH_SIZE_IS_NOT_A_POWER_OF_TWO: () = assert!(N.is_power_of_two()); + const MIN_ALIGN_IS_LESS_THAN_ALIGN_OF_T: () = assert!(MIN_ALIGN >= core::mem::align_of::()); +} diff --git a/crates/bevy_ptr/src/lib.rs b/crates/bevy_ptr/src/lib.rs index 8f1f476a906e2b..553cd22f907743 100644 --- a/crates/bevy_ptr/src/lib.rs +++ b/crates/bevy_ptr/src/lib.rs @@ -2,8 +2,18 @@ #![no_std] #![warn(missing_docs)] +//TransparentWrapper is required for as_inner* conversions +//Re-exported from the [`bytemuck`] crate to ensure users can implement this trait for their own types +pub use bytemuck; + +//Re-exported the [`elain`] crate to ensure users can generically deal with alignments when using AlignedBatches +pub use elain; + use core::{cell::UnsafeCell, marker::PhantomData, mem::MaybeUninit, ptr::NonNull}; +/// Batch-related functionality +pub mod batch; + /// Type-erased borrow of some unknown type chosen when constructing this type. /// /// This type tries to act "borrow-like" which means that: @@ -125,6 +135,7 @@ impl<'a> Ptr<'a> { self.0.as_ptr() } } + impl_ptr!(PtrMut); impl<'a> PtrMut<'a> { /// Transforms this [`PtrMut`] into an [`OwningPtr`] @@ -155,6 +166,7 @@ impl<'a> PtrMut<'a> { self.0.as_ptr() } } + impl_ptr!(OwningPtr); impl<'a> OwningPtr<'a> { /// Consumes a value and creates an [`OwningPtr`] to it while ensuring a double drop does not happen. @@ -195,29 +207,77 @@ impl<'a> OwningPtr<'a> { } } -/// Conceptually equivalent to `&'a [T]` but with length information cut out for performance reasons -pub struct ThinSlicePtr<'a, T> { +/// Conceptually equivalent to `&'a [T]` but with length information cut out for performance reasons. +/// The slice itself is aligned to at least `MAX_SIMD_ALIGNMENT`, however its elements may not be. +/// Use the `align_to` method to get batches of suitable alignment. +pub struct ThinSimdAlignedSlicePtr<'a, T> { ptr: NonNull, #[cfg(debug_assertions)] len: usize, _marker: PhantomData<&'a [T]>, } -impl<'a, T> ThinSlicePtr<'a, T> { +impl<'a, T> ThinSimdAlignedSlicePtr<'a, T> { + /// # Safety + /// The contents of the slice returned by this function must never be accessed + #[inline] + pub unsafe fn dangling() -> Self { + let item_layout = core::alloc::Layout::new::(); + + let dangling = + NonNull::new(item_layout.align().max(batch::MAX_SIMD_ALIGNMENT) as *mut T).unwrap(); + + Self { + ptr: dangling, + #[cfg(debug_assertions)] + len: 0, + _marker: PhantomData, + } + } + + /// # Safety + /// `ptr` must be non-null + /// `ptr` must be aligned to at least `MAX_SIMD_ALIGNMENT` #[inline] + pub unsafe fn new(ptr: *mut T, _len: usize) -> Self { + //NOTE: ZSTs may cause this "slice" to point into nothingness. 
+ //This sounds dangerous, but won't cause harm as nothing + //will actually access anything "in the slice" + + //TODO: when pointer_is_aligned is standardized, we can just use ptr::is_aligned() + #[cfg(debug_assertions)] + debug_assert_eq!(ptr as usize % batch::MAX_SIMD_ALIGNMENT, 0); + + Self { + ptr: NonNull::new_unchecked(ptr), + #[cfg(debug_assertions)] + len: _len, + _marker: PhantomData, + } + } + /// Indexes the slice without doing bounds checks /// /// # Safety /// `index` must be in-bounds. + #[inline] pub unsafe fn get(self, index: usize) -> &'a T { #[cfg(debug_assertions)] debug_assert!(index < self.len); &*self.ptr.as_ptr().add(index) } + + /// # Safety + /// `index` must be in bounds + /// `index + len` must be in bounds + #[inline] + pub unsafe fn get_slice(self, index: usize, len: usize) -> &'a [T] { + core::slice::from_raw_parts(self.ptr.as_ptr().add(index), len) + } } -impl<'a, T> Clone for ThinSlicePtr<'a, T> { +impl<'a, T> Clone for ThinSimdAlignedSlicePtr<'a, T> { fn clone(&self) -> Self { Self { ptr: self.ptr, @@ -228,16 +288,17 @@ impl<'a, T> Clone for ThinSlicePtr<'a, T> { } } -impl<'a, T> Copy for ThinSlicePtr<'a, T> {} +impl<'a, T> Copy for ThinSimdAlignedSlicePtr<'a, T> {} -impl<'a, T> From<&'a [T]> for ThinSlicePtr<'a, T> { +//Helpers for the UnsafeCell cases +impl<'a, T> ThinSimdAlignedSlicePtr<'a, UnsafeCell> { + /// Get an immutable view of this `ThinSimdAlignedSlicePtr`'s contents. Note that this is not a reference type. #[inline] - fn from(slice: &'a [T]) -> Self { - Self { - // SAFETY: a reference can never be null - ptr: unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut T) }, + pub fn as_deref(self) -> ThinSimdAlignedSlicePtr<'a, T> { + ThinSimdAlignedSlicePtr::<'a, T> { + ptr: self.ptr.cast::(), #[cfg(debug_assertions)] - len: slice.len(), + len: self.len, _marker: PhantomData, } }