From 9684ebf1bf80b9eb8aa2924da6b9748fb96d6750 Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Sat, 11 Sep 2021 16:36:35 +0100
Subject: [PATCH 1/5] fill_via_chunks: use safe code via chunks_exact_mut on BE

This (specifically, using chunks_exact_mut) actually improves performance
substantially.
---
 rand_core/src/impls.rs | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 2588a72ea3f..f16f5b5b8d2 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -58,9 +58,8 @@ macro_rules! fill_via_chunks {
         let chunk_size_u8 = min($src.len() * SIZE, $dst.len());
         let chunk_size = (chunk_size_u8 + SIZE - 1) / SIZE;
 
-        // The following can be replaced with safe code, but unfortunately it's
-        // ca. 8% slower.
         if cfg!(target_endian = "little") {
+            // On LE we can do a simple copy, which is 25-50% faster:
             unsafe {
                 core::ptr::copy_nonoverlapping(
                     $src.as_ptr() as *const u8,
@@ -68,15 +67,16 @@ macro_rules! fill_via_chunks {
                     chunk_size_u8);
             }
         } else {
-            for (&n, chunk) in $src.iter().zip($dst.chunks_mut(SIZE)) {
-                let tmp = n.to_le();
-                let src_ptr = &tmp as *const $ty as *const u8;
-                unsafe {
-                    core::ptr::copy_nonoverlapping(
-                        src_ptr,
-                        chunk.as_mut_ptr(),
-                        chunk.len());
-                }
+            // This code is valid on all arches, but slower than the above:
+            let mut i = 0;
+            let mut iter = $dst[..chunk_size_u8].chunks_exact_mut(SIZE);
+            while let Some(chunk) = iter.next() {
+                chunk.copy_from_slice(&$src[i].to_le_bytes());
+                i += 1;
+            }
+            let chunk = iter.into_remainder();
+            if !chunk.is_empty() {
+                chunk.copy_from_slice(&$src[i].to_le_bytes()[..chunk.len()]);
             }
         }
 

From 9972046a118844b9d9ea759e292de08f26761b7d Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Mon, 13 Sep 2021 09:30:14 +0100
Subject: [PATCH 2/5] fill_via_chunks: make a generic function

---
 rand_core/src/impls.rs | 76 +++++++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index f16f5b5b8d2..bd21b7c731e 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,36 +52,52 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-macro_rules! fill_via_chunks {
-    ($src:expr, $dst:expr, $ty:ty) => {{
-        const SIZE: usize = core::mem::size_of::<$ty>();
-        let chunk_size_u8 = min($src.len() * SIZE, $dst.len());
-        let chunk_size = (chunk_size_u8 + SIZE - 1) / SIZE;
+trait ToLe: Copy {
+    type Bytes: AsRef<[u8]>;
+    fn to_le_bytes(self) -> Self::Bytes;
+}
+impl ToLe for u32 {
+    type Bytes = [u8; 4];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+}
+impl ToLe for u64 {
+    type Bytes = [u8; 8];
+    fn to_le_bytes(self) -> Self::Bytes {
+        self.to_le_bytes()
+    }
+}
+
+fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+    let size = core::mem::size_of::<T>();
+    let chunk_size_u8 = min(src.len() * size, dest.len());
+    let chunk_size = (chunk_size_u8 + size - 1) / size;
 
-        if cfg!(target_endian = "little") {
-            // On LE we can do a simple copy, which is 25-50% faster:
-            unsafe {
-                core::ptr::copy_nonoverlapping(
-                    $src.as_ptr() as *const u8,
-                    $dst.as_mut_ptr(),
-                    chunk_size_u8);
-            }
-        } else {
-            // This code is valid on all arches, but slower than the above:
-            let mut i = 0;
-            let mut iter = $dst[..chunk_size_u8].chunks_exact_mut(SIZE);
-            while let Some(chunk) = iter.next() {
-                chunk.copy_from_slice(&$src[i].to_le_bytes());
-                i += 1;
-            }
-            let chunk = iter.into_remainder();
-            if !chunk.is_empty() {
-                chunk.copy_from_slice(&$src[i].to_le_bytes()[..chunk.len()]);
-            }
-        }
+    if cfg!(target_endian = "little") {
+        // On LE we can do a simple copy, which is 25-50% faster:
+        unsafe {
+            core::ptr::copy_nonoverlapping(
+                src.as_ptr() as *const u8,
+                dest.as_mut_ptr(),
+                chunk_size_u8,
+            );
+        }
+    } else {
+        // This code is valid on all arches, but slower than the above:
+        let mut i = 0;
+        let mut iter = dest[..chunk_size_u8].chunks_exact_mut(size);
+        while let Some(chunk) = iter.next() {
+            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
+            i += 1;
+        }
+        let chunk = iter.into_remainder();
+        if !chunk.is_empty() {
+            chunk.copy_from_slice(&src[i].to_le_bytes().as_ref()[..chunk.len()]);
+        }
+    }
 
-        (chunk_size, chunk_size_u8)
-    }};
+    (chunk_size, chunk_size_u8)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -115,7 +131,7 @@ macro_rules! fill_via_chunks {
 /// }
 /// ```
 pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u32)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block
@@ -129,7 +145,7 @@ pub fn fill_via_u32_chunks(src: &[u32], dest: &mut [u8]) -> (usize, usize) {
 ///
 /// See `fill_via_u32_chunks` for an example.
 pub fn fill_via_u64_chunks(src: &[u64], dest: &mut [u8]) -> (usize, usize) {
-    fill_via_chunks!(src, dest, u64)
+    fill_via_chunks(src, dest)
 }
 
 /// Implement `next_u32` via `fill_bytes`, little-endian order.
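The portable branch introduced in the two patches above relies on `chunks_exact_mut` plus `into_remainder` instead of raw pointer copies. The sketch below is not part of the patch series; it is a standalone illustration of the same technique, hard-coded to `u32` for brevity, and `fill_le_bytes_portable` is a hypothetical name:

```rust
// Serialise `u32` words to little-endian bytes: whole words via
// `chunks_exact_mut`, then a partial trailing chunk via `into_remainder`.
fn fill_le_bytes_portable(src: &[u32], dest: &mut [u8]) -> usize {
    let byte_len = core::cmp::min(src.len() * 4, dest.len());
    let mut i = 0;
    let mut iter = dest[..byte_len].chunks_exact_mut(4);
    while let Some(chunk) = iter.next() {
        // Full 4-byte chunk: copy one whole word.
        chunk.copy_from_slice(&src[i].to_le_bytes());
        i += 1;
    }
    let rem = iter.into_remainder();
    if !rem.is_empty() {
        // Partial chunk: copy only the leading bytes of the next word.
        rem.copy_from_slice(&src[i].to_le_bytes()[..rem.len()]);
    }
    byte_len
}

fn main() {
    let src = [0x0403_0201u32, 0x0807_0605];
    let mut dest = [0u8; 6]; // deliberately not a multiple of 4
    let n = fill_le_bytes_portable(&src, &mut dest);
    assert_eq!(n, 6);
    assert_eq!(dest, [1, 2, 3, 4, 5, 6]);
}
```

A destination whose length is not a multiple of four exercises the `into_remainder` path, which copies only the leading bytes of the final word; a `while let` loop is used because `into_remainder` consumes the iterator.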
From 93ade1a009b66476491f0521a56e5dd97a4791bc Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 14 Sep 2021 08:11:53 +0100
Subject: [PATCH 3/5] fill_via_chunks: better value names

---
 rand_core/src/impls.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index bd21b7c731e..7c09b5488ac 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -71,8 +71,8 @@ impl ToLe for u64 {
 
 fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
     let size = core::mem::size_of::<T>();
-    let chunk_size_u8 = min(src.len() * size, dest.len());
-    let chunk_size = (chunk_size_u8 + size - 1) / size;
+    let byte_len = min(src.len() * size, dest.len());
+    let num_chunks = (byte_len + size - 1) / size;
 
     if cfg!(target_endian = "little") {
         // On LE we can do a simple copy, which is 25-50% faster:
@@ -80,13 +80,13 @@ fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
             core::ptr::copy_nonoverlapping(
                 src.as_ptr() as *const u8,
                 dest.as_mut_ptr(),
-                chunk_size_u8,
+                byte_len,
            );
        }
    } else {
        // This code is valid on all arches, but slower than the above:
        let mut i = 0;
-        let mut iter = dest[..chunk_size_u8].chunks_exact_mut(size);
+        let mut iter = dest[..byte_len].chunks_exact_mut(size);
        while let Some(chunk) = iter.next() {
            chunk.copy_from_slice(src[i].to_le_bytes().as_ref());
            i += 1;
@@ -97,7 +97,7 @@ fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
        }
    }
 
-    (chunk_size, chunk_size_u8)
+    (num_chunks, byte_len)
 }
 
 /// Implement `fill_bytes` by reading chunks from the output buffer of a block

From 19b7a76481c23497aff33fe7024cac4564d5196e Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Tue, 14 Sep 2021 08:17:26 +0100
Subject: [PATCH 4/5] fill_via_chunks: make ToLe an unsafe trait

---
 rand_core/src/impls.rs | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 7c09b5488ac..1105094221e 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,17 +52,19 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-trait ToLe: Copy {
+/// Contract: implementing type must be memory-safe to observe as a byte array
+/// (implies no uninitialised padding).
+unsafe trait ToLe: Copy {
     type Bytes: AsRef<[u8]>;
     fn to_le_bytes(self) -> Self::Bytes;
 }
-impl ToLe for u32 {
+unsafe impl ToLe for u32 {
     type Bytes = [u8; 4];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
 }
-impl ToLe for u64 {
+unsafe impl ToLe for u64 {
     type Bytes = [u8; 8];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()

From 34a8f13d863a79be9d41f2ef63dc3b85b156c9dc Mon Sep 17 00:00:00 2001
From: Diggory Hardy
Date: Wed, 15 Sep 2021 09:27:11 +0100
Subject: [PATCH 5/5] Replace ToLe with Observable, including as_byte_slice method

---
 rand_core/src/impls.rs | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/rand_core/src/impls.rs b/rand_core/src/impls.rs
index 1105094221e..12838dd138d 100644
--- a/rand_core/src/impls.rs
+++ b/rand_core/src/impls.rs
@@ -52,39 +52,44 @@ pub fn fill_bytes_via_next<R: RngCore + ?Sized>(rng: &mut R, dest: &mut [u8]) {
     }
 }
 
-/// Contract: implementing type must be memory-safe to observe as a byte array
-/// (implies no uninitialised padding).
-unsafe trait ToLe: Copy {
+trait Observable: Copy {
     type Bytes: AsRef<[u8]>;
     fn to_le_bytes(self) -> Self::Bytes;
+
+    // Contract: observing self is memory-safe (implies no uninitialised padding)
+    fn as_byte_slice(x: &[Self]) -> &[u8];
 }
-unsafe impl ToLe for u32 {
+impl Observable for u32 {
     type Bytes = [u8; 4];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<Self>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
 }
-unsafe impl ToLe for u64 {
+impl Observable for u64 {
     type Bytes = [u8; 8];
     fn to_le_bytes(self) -> Self::Bytes {
         self.to_le_bytes()
     }
+    fn as_byte_slice(x: &[Self]) -> &[u8] {
+        let ptr = x.as_ptr() as *const u8;
+        let len = x.len() * core::mem::size_of::<Self>();
+        unsafe { core::slice::from_raw_parts(ptr, len) }
+    }
 }
 
-fn fill_via_chunks<T: ToLe>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
+fn fill_via_chunks<T: Observable>(src: &[T], dest: &mut [u8]) -> (usize, usize) {
     let size = core::mem::size_of::<T>();
     let byte_len = min(src.len() * size, dest.len());
     let num_chunks = (byte_len + size - 1) / size;
 
     if cfg!(target_endian = "little") {
         // On LE we can do a simple copy, which is 25-50% faster:
-        unsafe {
-            core::ptr::copy_nonoverlapping(
-                src.as_ptr() as *const u8,
-                dest.as_mut_ptr(),
-                byte_len,
-            );
-        }
+        dest[..byte_len].copy_from_slice(&T::as_byte_slice(&src[..num_chunks])[..byte_len]);
     } else {
         // This code is valid on all arches, but slower than the above:
         let mut i = 0;
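The safe little-endian fast path in the final patch depends on viewing a `&[u32]` or `&[u64]` as raw bytes, which is what `as_byte_slice` provides. The following is a minimal standalone sketch of that idea, outside the patch series, using a hypothetical `word_bytes` helper rather than the `Observable` trait itself:

```rust
// View a `&[u32]` as `&[u8]` so it can be copied in one step on LE targets.
fn word_bytes(x: &[u32]) -> &[u8] {
    let ptr = x.as_ptr() as *const u8;
    let len = x.len() * core::mem::size_of::<u32>();
    // SAFETY: u32 has no padding and every byte of a u32 is initialised,
    // so observing the words as bytes is sound; ptr/len stay in bounds.
    unsafe { core::slice::from_raw_parts(ptr, len) }
}

fn main() {
    let src = [0x0403_0201u32, 0x0807_0605];
    let mut dest = [0u8; 8];
    // On a little-endian target this single copy is equivalent to writing
    // each word's to_le_bytes() in sequence.
    dest.copy_from_slice(&word_bytes(&src)[..dest.len()]);
    if cfg!(target_endian = "little") {
        assert_eq!(dest, [1, 2, 3, 4, 5, 6, 7, 8]);
    }
}
```

On a big-endian target the byte order of such a view would differ from `to_le_bytes`, which is why the patch keeps the per-word fallback branch for other architectures.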