From 647371e8cda2475a081825cf5c3fcb5379b51585 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Fri, 29 Jun 2018 11:01:56 +0100
Subject: [PATCH 01/14] sample_indices: revise benchmarks (pre-optimisation)

---
 benches/seq.rs | 56 ++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/benches/seq.rs b/benches/seq.rs
index 260e2334a41..d844ab7f599 100644
--- a/benches/seq.rs
+++ b/benches/seq.rs
@@ -1,4 +1,5 @@
 #![feature(test)]
+#![allow(non_snake_case)]
 
 extern crate test;
 extern crate rand;
@@ -27,28 +28,31 @@ fn seq_slice_choose_1_of_1000(b: &mut Bencher) {
     })
 }
 
-#[bench]
-fn seq_slice_choose_multiple_1_of_1000(b: &mut Bencher) {
-    let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
-    let x : &[usize] = &[1; 1000];
-    b.iter(|| {
-        x.choose_multiple(&mut rng, 1).cloned().next()
-    })
-}
-
-#[bench]
-fn seq_slice_choose_multiple_10_of_100(b: &mut Bencher) {
-    let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
-    let x : &[usize] = &[1; 100];
-    let mut buf = [0; 10];
-    b.iter(|| {
-        for (v, slot) in x.choose_multiple(&mut rng, buf.len()).zip(buf.iter_mut()) {
-            *slot = *v;
+macro_rules! seq_slice_choose_multiple {
+    ($name:ident, $amount:expr, $length:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
+            let x : &[i32] = &[$amount; $length];
+            let mut result = [0i32; $amount];
+            b.iter(|| {
+                // Collect full result to prevent unwanted shortcuts getting
+                // first element (in case sample_indices returns an iterator).
+                for (slot, sample) in result.iter_mut().zip(
+                    x.choose_multiple(&mut rng, $amount)) {
+                    *slot = *sample;
+                }
+                result[$amount-1]
+            })
         }
-        buf
-    })
+    }
 }
 
+seq_slice_choose_multiple!(seq_slice_choose_multiple_1_of_1000, 1, 1000);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_950_of_1000, 950, 1000);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_10_of_100, 10, 100);
+seq_slice_choose_multiple!(seq_slice_choose_multiple_90_of_100, 90, 100);
+
 #[bench]
 fn seq_iter_choose_from_100(b: &mut Bencher) {
     let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
@@ -78,17 +82,21 @@ fn seq_iter_choose_multiple_fill_10_of_100(b: &mut Bencher) {
 }
 
 macro_rules! sample_indices {
-    ($name:ident, $amount:expr, $length:expr) => {
+    ($name:ident, $fn:ident, $amount:expr, $length:expr) => {
         #[bench]
         fn $name(b: &mut Bencher) {
             let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
             b.iter(|| {
-                sample_indices(&mut rng, $length, $amount)
+                $fn(&mut rng, $length, $amount)
             })
         }
     }
 }
 
-sample_indices!(seq_sample_indices_10_of_1k, 10, 1000);
-sample_indices!(seq_sample_indices_50_of_1k, 50, 1000);
-sample_indices!(seq_sample_indices_100_of_1k, 100, 1000);
+sample_indices!(misc_sample_indices_1_of_1k, sample_indices, 1, 1000);
+sample_indices!(misc_sample_indices_10_of_1k, sample_indices, 10, 1000);
+sample_indices!(misc_sample_indices_100_of_1k, sample_indices, 100, 1000);
+sample_indices!(misc_sample_indices_100_of_1M, sample_indices, 100, 1000_000);
+sample_indices!(misc_sample_indices_100_of_1G, sample_indices, 100, 1000_000_000);
+sample_indices!(misc_sample_indices_400_of_1G, sample_indices, 400, 1000_000_000);
+sample_indices!(misc_sample_indices_600_of_1G, sample_indices, 600, 1000_000_000);

From 28d4949006a0960b41d223a144e9f84cd7d01f53 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Thu, 7 Jun 2018 12:43:34 +0100
Subject: [PATCH 02/14] sample_indices: refactor; switch _inplaces to u32 only

The sample_indices_inplace algorithm allocates all `length` indices up
front, so it is inappropriate for large lengths; restrict it to u32.
---
 src/seq.rs | 95 +++++++++++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 48 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index e030712b3d1..70471e1be32 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -522,76 +522,78 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize
     // and a trade off could probably be made between memory/cpu, since hashmap operations
     // are slower than array index swapping.
     if amount >= length / 20 {
-        sample_indices_inplace(rng, length, amount)
+        sample_indices_inplace(rng, length as u32, amount as u32)
+            .into_iter().map(|x| x as usize).collect()
     } else {
         sample_indices_cache(rng, length, amount)
     }
 }
 
-/// Sample an amount of indices using an inplace partial fisher yates method.
+/// Randomly sample exactly `amount` indices from `0..length`, using an inplace
+/// partial Fisher-Yates method.
 ///
 /// This allocates the entire `length` of indices and randomizes only the first `amount`.
 /// It then truncates to `amount` and returns.
+/// 
+/// This method is not appropriate for large `length` and potentially uses a lot
+/// of memory; because of this we only implement it for `u32` indices (which
+/// improves performance in all cases).
 ///
-/// This is better than using a `HashMap` "cache" when `amount >= length / 2`
-/// since it does not require allocating an extra cache and is much faster.
+/// This is likely the fastest for small lengths since it avoids the need for
+/// allocations. Set-up is `O(length)` time and memory and shuffling is
+/// `O(amount)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_inplace<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
+fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32)
+    -> Vec<u32>
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
-    let mut indices: Vec<usize> = Vec::with_capacity(length);
+    let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
     indices.extend(0..length);
     for i in 0..amount {
-        let j: usize = rng.gen_range(i, length);
-        indices.swap(i, j);
+        let j: u32 = rng.gen_range(i, length);
+        indices.swap(i as usize, j as usize);
     }
-    indices.truncate(amount);
-    debug_assert_eq!(indices.len(), amount);
+    indices.truncate(amount as usize);
+    debug_assert_eq!(indices.len(), amount as usize);
     indices
 }
 
-
-/// This method performs a partial fisher-yates on a range of indices using a
-/// `HashMap` as a cache to record potential collisions.
+/// Randomly sample exactly `amount` indices from `0..length`, using a
+/// dynamically-cached partial Fisher-Yates method.
 ///
-/// The cache avoids allocating the entire `length` of values. This is especially useful when
-/// `amount <<< length`, i.e. select 3 non-repeating from `1_000_000`
+/// The cache avoids allocating the entire `length` of values. This is
+/// especially useful when `amount <<< length`; e.g. selecting 3 non-repeating
+/// values from `1_000_000`. The algorithm is `O(amount)` time and memory,
+/// but due to overheads will often be slower than other approaches.
 #[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(
-    rng: &mut R,
-    length: usize,
-    amount: usize,
-) -> Vec<usize>
+fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize)
+    -> Vec<usize>
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
     #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
     #[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
-    let mut out = Vec::with_capacity(amount);
+    let mut indices = Vec::with_capacity(amount);
     for i in 0..amount {
         let j: usize = rng.gen_range(i, length);
 
-        // equiv: let tmp = slice[i];
-        let tmp = match cache.get(&i) {
-            Some(e) => *e,
+        // get the current values at i and j ...
+        let x_i = match cache.get(&i) {
+            Some(x) => *x,
             None => i,
         };
-
-        // equiv: slice[i] = slice[j];
-        let x = match cache.get(&j) {
+        let x_j = match cache.get(&j) {
             Some(x) => *x,
             None => j,
         };
 
-        // equiv: slice[j] = tmp;
-        cache.insert(j, tmp);
-
-        // note that in the inplace version, slice[i] is automatically "returned" value
-        out.push(x);
+        // ... and swap them
+        cache.insert(j, x_i);
+        indices.push(x_j);  // push at position i
     }
-    debug_assert_eq!(out.len(), amount);
-    out
+    debug_assert_eq!(indices.len(), amount);
+    indices
 }
 
 #[cfg(test)]
@@ -752,14 +754,19 @@ mod test {
         let v = sample_slice(&mut r, &[42, 133], 2);
         assert!(&v[..] == [42, 133] || v[..] == [133, 42]);
 
-        assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0usize; 0]);
+        assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0; 0]);
+        assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0; 0]);
         assert_eq!(&sample_indices_inplace(&mut r, 1, 1)[..], [0]);
 
-        assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0usize; 0]);
-        assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0usize; 0]);
+        assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0; 0]);
+        assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0; 0]);
         assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
 
+        // These algorithms should be fast with big numbers. Test average.
+        let sum = sample_indices_cache(&mut r, 1 << 50, 10)
+            .iter().fold(0, |a, b| a + b);
+        assert!(1 << 50 < sum && sum < (1 << 50) * 25);
+        
         // Make sure lucky 777's aren't lucky
         let slice = &[42, 777];
         let mut num_42 = 0;
@@ -783,27 +790,19 @@ mod test {
     fn test_sample_slice() {
         let xor_rng = XorShiftRng::from_seed;
 
-        let max_range = 100;
         let mut r = ::test::rng(403);
 
-        for length in 1usize..max_range {
+        for n in 1usize..20 {
+            let length = 5*n - 4;   // 1, 6, ...
             let amount = r.gen_range(0, length);
             let mut seed = [0u8; 16];
             r.fill(&mut seed);
 
-            // assert that the two index methods give exactly the same result
-            let inplace = sample_indices_inplace(
-                &mut xor_rng(seed), length, amount);
-            let cache = sample_indices_cache(
-                &mut xor_rng(seed), length, amount);
-            assert_eq!(inplace, cache);
-
             // assert the basics work
             let regular = sample_indices(
                 &mut xor_rng(seed), length, amount);
             assert_eq!(regular.len(), amount);
             assert!(regular.iter().all(|e| *e < length));
-            assert_eq!(regular, inplace);
 
             // also test that sampling the slice works
             let vec: Vec<usize> = (0..length).collect();

From 949833ddd6b023f61d6f6a64d4b630d24b3792ba Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Fri, 29 Jun 2018 11:02:58 +0100
Subject: [PATCH 03/14] sample_indices: add Floyd's algorithm and update
 selection logic

---
 src/seq.rs | 126 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 105 insertions(+), 21 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 70471e1be32..47e5171f320 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -495,12 +495,16 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
 ///
 /// The values are non-repeating and in random order.
 ///
-/// This implementation uses `O(amount)` time and memory.
+/// This method is used internally by the slice sampling methods, but it can
+/// sometimes be useful to have the indices themselves so this is provided as
+/// an alternative.
 ///
-/// This method is used internally by the slice sampling methods, but it can sometimes be useful to
-/// have the indices themselves so this is provided as an alternative.
+/// The implementation used is not specified; we automatically select the
+/// fastest available implementation. Roughly speaking, complexity is
+/// `O(amount)` if `amount` is small relative to `length`, otherwise `O(length)`.
 ///
-/// Panics if `amount > length`
+/// Panics if `amount > length`; may panic with extremely large `amount` or
+/// `length` (when `36*length` or `2720*amount` overflows `usize`).
 #[cfg(feature = "alloc")]
 pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
     where R: Rng + ?Sized,
@@ -508,29 +512,58 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize
     if amount > length {
         panic!("`amount` must be less than or equal to `slice.len()`");
     }
-
-    // We are going to have to allocate at least `amount` for the output no matter what. However,
-    // if we use the `cached` version we will have to allocate `amount` as a HashMap as well since
-    // it inserts an element for every loop.
-    //
-    // Therefore, if `amount >= length / 2` then inplace will be both faster and use less memory.
-    // In fact, benchmarks show the inplace version is faster for length up to about 20 times
-    // faster than amount.
-    //
-    // TODO: there is probably even more fine-tuning that can be done here since
-    // `HashMap::with_capacity(amount)` probably allocates more than `amount` in practice,
-    // and a trade off could probably be made between memory/cpu, since hashmap operations
-    // are slower than array index swapping.
-    if amount >= length / 20 {
-        sample_indices_inplace(rng, length as u32, amount as u32)
-            .into_iter().map(|x| x as usize).collect()
+    
+    // Choice of algorithm here depends on both length and amount. See:
+    // https://github.com/rust-lang-nursery/rand/pull/479
+
+    if amount < 517 {
+        const C: [[usize; 2]; 2] = [[1, 36], [200, 440]];
+        let j = if length < 500_000 { 0 } else { 1 };
+        let m4 = 4 * amount;
+        if C[0][j] * length < (C[1][j] + m4) * amount {
+            sample_indices_inplace(rng, length as u32, amount as u32)
+                .into_iter()
+                .map(|x| x as usize)
+                .collect()
+        } else {
+            sample_indices_floyd(rng, length, amount)
+        }
     } else {
-        sample_indices_cache(rng, length, amount)
+        const C: [[usize; 2]; 2] = [[1, 36], [62*40, 68*40]];
+        let j = if length < 500_000 { 0 } else { 1 };
+        if C[0][j] * length < C[1][j] * amount {
+            sample_indices_inplace(rng, length as u32, amount as u32)
+                .into_iter()
+                .map(|x| x as usize)
+                .collect()
+        } else {
+            sample_indices_cache(rng, length, amount)
+        }
+    }
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
+/// combination algorithm.
+///
+/// This implementation uses `O(amount)` memory and `O(amount^2)` time.
+#[cfg(feature = "alloc")]
+fn sample_indices_floyd<R>(rng: &mut R, length: usize, amount: usize)
+    -> Vec<usize>
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount <= length);
+    let mut indices = Vec::with_capacity(amount);
+    for j in length - amount .. length {
+        let t = rng.gen_range(0, j + 1);
+        let t = if indices.contains(&t) { j } else { t };
+        indices.push( t );
     }
+    indices
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using an inplace
 /// partial Fisher-Yates method.
+/// Sample an amount of indices using an inplace partial fisher yates method.
 ///
 /// This allocates the entire `length` of indices and randomizes only the first `amount`.
 /// It then truncates to `amount` and returns.
@@ -762,11 +795,19 @@ mod test {
         assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0; 0]);
         assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
 
+        assert_eq!(&sample_indices_floyd(&mut r, 0, 0)[..], [0; 0]);
+        assert_eq!(&sample_indices_floyd(&mut r, 1, 0)[..], [0; 0]);
+        assert_eq!(&sample_indices_floyd(&mut r, 1, 1)[..], [0]);
+        
         // These algorithms should be fast with big numbers. Test average.
         let sum = sample_indices_cache(&mut r, 1 << 50, 10)
             .iter().fold(0, |a, b| a + b);
         assert!(1 << 50 < sum && sum < (1 << 50) * 25);
         
+        let sum = sample_indices_floyd(&mut r, 1 << 50, 10)
+            .iter().fold(0, |a, b| a + b);
+        assert!(1 << 50 < sum && sum < (1 << 50) * 25);
+
         // Make sure lucky 777's aren't lucky
         let slice = &[42, 777];
         let mut num_42 = 0;
@@ -818,7 +859,50 @@ mod test {
             }
         }
     }
+    
+    #[test]
+    #[cfg(feature = "alloc")]
+    fn test_sample_alg() {
+        let xor_rng = XorShiftRng::from_seed;
 
+        let mut r = ::test::rng(403);
+        let mut seed = [0u8; 16];
+        
+        // We can't test which algorithm is used directly, but each should
+        // produce a different sample with the same parameters.
+        
+        // A small length and relatively large amount should use inplace
+        r.fill(&mut seed);
+        let (length, amount) = (100, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
+        let v2 = sample_indices_inplace(&mut xor_rng(seed),
+            length as u32, amount as u32);
+        assert!(v1.iter().all(|e| *e < length));
+        assert!(v1.iter().zip(v2.iter()).all(|(x,y)| *x == *y as usize));
+        
+        // Test other algs do produce different results
+        let v3 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
+        let v4 = sample_indices_cache(&mut xor_rng(seed), length, amount);
+        assert!(v1 != v3);
+        assert!(v1 != v4);
+        
+        // A large length and small amount should use Floyd
+        r.fill(&mut seed);
+        let (length, amount) = (1<<20, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
+        let v2 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
+        assert!(v1.iter().all(|e| *e < length));
+        assert_eq!(v1, v2);
+        
+        // A large length and larger amount should use cache
+        r.fill(&mut seed);
+        let (length, amount) = (1<<20, 600);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
+        let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
+        assert!(v1.iter().all(|e| *e < length));
+        assert_eq!(v1, v2);
+    }
+    
     #[test]
     #[cfg(feature = "alloc")]
     fn test_weighted() {

From 2f5a03a7f07919b701e7f1126a2820cf15312136 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Sat, 2 Jun 2018 10:51:30 +0100
Subject: [PATCH 04/14] sample_indices: use u32 internally only

(controversial)
---
 src/seq.rs | 123 +++++++++++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 61 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 47e5171f320..2fd2dd2b443 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -396,7 +396,7 @@ impl<I> IteratorRandom for I where I: Iterator + Sized {}
 pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> {
     slice: &'a S,
     _phantom: ::core::marker::PhantomData<T>,
-    indices: vec::IntoIter<usize>,
+    indices: vec::IntoIter<u32>,
 }
 
 #[cfg(feature = "alloc")]
@@ -405,7 +405,7 @@ impl<'a, S: Index<usize, Output = T> + ?Sized + 'a, T: 'a> Iterator for SliceCho
 
     fn next(&mut self) -> Option<Self::Item> {
         // TODO: investigate using SliceIndex::get_unchecked when stable
-        self.indices.next().map(|i| &(*self.slice)[i])
+        self.indices.next().map(|i| &(*self.slice)[i as usize])
     }
     
     fn size_hint(&self) -> (usize, Option<usize>) {
@@ -464,7 +464,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     let indices = sample_indices(rng, slice.len(), amount);
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| slice[*i].clone()));
+    out.extend(indices.iter().map(|i| slice[*i as usize].clone()));
     out
 }
 
@@ -487,7 +487,7 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
     let indices = sample_indices(rng, slice.len(), amount);
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| &slice[*i]));
+    out.extend(indices.iter().map(|i| &slice[*i as usize]));
     out
 }
 
@@ -503,39 +503,44 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
 /// fastest available implementation. Roughly speaking, complexity is
 /// `O(amount)` if `amount` is small relative to `length`, otherwise `O(length)`.
 ///
-/// Panics if `amount > length`; may panic with extremely large `amount` or
-/// `length` (when `36*length` or `2720*amount` overflows `usize`).
+/// Note that we only support `u32` indices since this covers the vast majority
+/// of uses, and performance is significantly better than with `u64`.
+/// 
+/// If an allocation-free `no_std` function is required, it is suggested
+/// to adapt the internal `sample_indices_floyd` implementation.
+///
+/// Panics if `amount > length` or if `length` is not representable as a `u32`.
 #[cfg(feature = "alloc")]
-pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize>
+pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<u32>
     where R: Rng + ?Sized,
 {
     if amount > length {
-        panic!("`amount` must be less than or equal to `slice.len()`");
+        panic!("`amount` of samples must be less than or equal to `length`");
     }
+    if length > (::core::u32::MAX as usize) {
+        panic!("`length` is not representable as `u32`");
+    }
+    let amount = amount as u32;
+    let length = length as u32;
     
     // Choice of algorithm here depends on both length and amount. See:
     // https://github.com/rust-lang-nursery/rand/pull/479
+    // We do some calculations with u64 to avoid overflow.
 
     if amount < 517 {
-        const C: [[usize; 2]; 2] = [[1, 36], [200, 440]];
+        const C: [[u64; 2]; 2] = [[1, 36], [200, 440]];
         let j = if length < 500_000 { 0 } else { 1 };
-        let m4 = 4 * amount;
-        if C[0][j] * length < (C[1][j] + m4) * amount {
-            sample_indices_inplace(rng, length as u32, amount as u32)
-                .into_iter()
-                .map(|x| x as usize)
-                .collect()
+        let m4 = 4 * amount as u64;
+        if C[0][j] * (length as u64) < (C[1][j] + m4) * amount as u64 {
+            sample_indices_inplace(rng, length, amount)
         } else {
             sample_indices_floyd(rng, length, amount)
         }
     } else {
-        const C: [[usize; 2]; 2] = [[1, 36], [62*40, 68*40]];
+        const C: [[u64; 2]; 2] = [[1, 36], [62*40, 68*40]];
         let j = if length < 500_000 { 0 } else { 1 };
-        if C[0][j] * length < C[1][j] * amount {
-            sample_indices_inplace(rng, length as u32, amount as u32)
-                .into_iter()
-                .map(|x| x as usize)
-                .collect()
+        if C[0][j] * (length as u64) < C[1][j] * amount as u64 {
+            sample_indices_inplace(rng, length, amount)
         } else {
             sample_indices_cache(rng, length, amount)
         }
@@ -547,12 +552,12 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<usize
 ///
 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_floyd<R>(rng: &mut R, length: usize, amount: usize)
-    -> Vec<usize>
+fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32)
+    -> Vec<u32>
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
-    let mut indices = Vec::with_capacity(amount);
+    let mut indices = Vec::with_capacity(amount as usize);
     for j in length - amount .. length {
         let t = rng.gen_range(0, j + 1);
         let t = if indices.contains(&t) { j } else { t };
@@ -600,16 +605,16 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32)
 /// values from `1_000_000`. The algorithm is `O(amount)` time and memory,
 /// but due to overheads will often be slower than other approaches.
 #[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize)
-    -> Vec<usize>
+fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32)
+    -> Vec<u32>
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
-    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
+    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount as usize);
     #[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
-    let mut indices = Vec::with_capacity(amount);
+    let mut indices = Vec::with_capacity(amount as usize);
     for i in 0..amount {
-        let j: usize = rng.gen_range(i, length);
+        let j: u32 = rng.gen_range(i, length);
 
         // get the current values at i and j ...
         let x_i = match cache.get(&i) {
@@ -625,7 +630,7 @@ fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize)
         cache.insert(j, x_i);
         indices.push(x_j);  // push at position i
     }
-    debug_assert_eq!(indices.len(), amount);
+    debug_assert_eq!(indices.len(), amount as usize);
     indices
 }
 
@@ -800,13 +805,13 @@ mod test {
         assert_eq!(&sample_indices_floyd(&mut r, 1, 1)[..], [0]);
         
         // These algorithms should be fast with big numbers. Test average.
-        let sum = sample_indices_cache(&mut r, 1 << 50, 10)
+        let sum = sample_indices_cache(&mut r, 1 << 25, 10)
             .iter().fold(0, |a, b| a + b);
-        assert!(1 << 50 < sum && sum < (1 << 50) * 25);
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
         
-        let sum = sample_indices_floyd(&mut r, 1 << 50, 10)
+        let sum = sample_indices_floyd(&mut r, 1 << 25, 10)
             .iter().fold(0, |a, b| a + b);
-        assert!(1 << 50 < sum && sum < (1 << 50) * 25);
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 
         // Make sure lucky 777's aren't lucky
         let slice = &[42, 777];
@@ -833,7 +838,7 @@ mod test {
 
         let mut r = ::test::rng(403);
 
-        for n in 1usize..20 {
+        for n in 1..20 {
             let length = 5*n - 4;   // 1, 6, ...
             let amount = r.gen_range(0, length);
             let mut seed = [0u8; 16];
@@ -843,20 +848,16 @@ mod test {
             let regular = sample_indices(
                 &mut xor_rng(seed), length, amount);
             assert_eq!(regular.len(), amount);
-            assert!(regular.iter().all(|e| *e < length));
+            assert!(regular.iter().all(|e| *e < length as u32));
 
             // also test that sampling the slice works
-            let vec: Vec<usize> = (0..length).collect();
-            {
-                let result = sample_slice(&mut xor_rng(seed), &vec, amount);
-                assert_eq!(result, regular);
-            }
+            let vec: Vec<u32> = (0..(length as u32)).collect();
+            let result = sample_slice(&mut xor_rng(seed), &vec, amount);
+            assert_eq!(result, regular);
 
-            {
-                let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
-                let expected = regular.iter().map(|v| v).collect::<Vec<_>>();
-                assert_eq!(result, expected);
-            }
+            let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
+            let expected = regular.iter().map(|v| v).collect::<Vec<_>>();
+            assert_eq!(result, expected);
         }
     }
     
@@ -868,38 +869,38 @@ mod test {
         let mut r = ::test::rng(403);
         let mut seed = [0u8; 16];
         
-        // We can't test which algorithm is used directly, but each should
-        // produce a different sample with the same parameters.
+        // We can't test which algorithm is used directly, but Floyd's alg
+        // should produce different results from the others.
         
         // A small length and relatively large amount should use inplace
         r.fill(&mut seed);
-        let (length, amount) = (100, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
-        let v2 = sample_indices_inplace(&mut xor_rng(seed),
-            length as u32, amount as u32);
+        let (length, amount): (u32, u32) = (100, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
+        let v2 = sample_indices_inplace(&mut xor_rng(seed), length, amount);
         assert!(v1.iter().all(|e| *e < length));
-        assert!(v1.iter().zip(v2.iter()).all(|(x,y)| *x == *y as usize));
+        assert_eq!(v1, v2);
         
-        // Test other algs do produce different results
+        // Test Floyd's alg does produce different results
         let v3 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
-        let v4 = sample_indices_cache(&mut xor_rng(seed), length, amount);
         assert!(v1 != v3);
-        assert!(v1 != v4);
+        // However, the cache alg should produce the same results
+        let v4 = sample_indices_cache(&mut xor_rng(seed), length, amount);
+        assert_eq!(v1, v4);
         
         // A large length and small amount should use Floyd
         r.fill(&mut seed);
-        let (length, amount) = (1<<20, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
+        let (length, amount): (u32, u32) = (1<<20, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
         let v2 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
         assert!(v1.iter().all(|e| *e < length));
         assert_eq!(v1, v2);
         
         // A large length and larger amount should use cache
         r.fill(&mut seed);
-        let (length, amount) = (1<<20, 600);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount);
+        let (length, amount): (u32, u32) = (1<<20, 600);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
         let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
-        assert!(v1.iter().all(|e| *e < length));
+        assert!(v1.iter().all(|e| *e < length as u32));
         assert_eq!(v1, v2);
     }
     

From d4da64e272e768fb92a950153a00d4d154703680 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Sat, 2 Jun 2018 14:40:02 +0100
Subject: [PATCH 05/14] sample_indices: add 'shuffled' parameter

---
 CHANGELOG.md   |  5 +++
 benches/seq.rs |  4 +--
 src/seq.rs     | 88 +++++++++++++++++++++++++++++++-------------------
 3 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 716f6639a20..856d3ac35e4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,11 @@ A [separate changelog is kept for rand_core](rand_core/CHANGELOG.md).
 
 You may also find the [Update Guide](UPDATING.md) useful.
 
+## [0.6.0] - Unreleased
+
+### Sequences module
+- Optimised and changed return type of the `sample_indices` function. (#479)
+
 ## [0.5.4] - 2018-07-11
 ### Platform support
 - Make `OsRng` work via WASM/stdweb for WebWorkers
diff --git a/benches/seq.rs b/benches/seq.rs
index d844ab7f599..a38ad1148f3 100644
--- a/benches/seq.rs
+++ b/benches/seq.rs
@@ -39,7 +39,7 @@ macro_rules! seq_slice_choose_multiple {
                 // Collect full result to prevent unwanted shortcuts getting
                 // first element (in case sample_indices returns an iterator).
                 for (slot, sample) in result.iter_mut().zip(
-                    x.choose_multiple(&mut rng, $amount)) {
+                    x.choose_multiple(&mut rng, $amount, false)) {
                     *slot = *sample;
                 }
                 result[$amount-1]
@@ -87,7 +87,7 @@ macro_rules! sample_indices {
         fn $name(b: &mut Bencher) {
             let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
             b.iter(|| {
-                $fn(&mut rng, $length, $amount)
+                $fn(&mut rng, $length, $amount, true)
             })
         }
     }
diff --git a/src/seq.rs b/src/seq.rs
index 2fd2dd2b443..72f7e58953d 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -66,12 +66,13 @@ pub trait SliceRandom {
     /// In case this API is not sufficiently flexible, use `sample_indices` then
     /// apply the indices to the slice.
     /// 
-    /// Although the elements are selected randomly, the order of returned
-    /// elements is neither stable nor fully random. If random ordering is
-    /// desired, either use `partial_shuffle` or use this method and shuffle
-    /// the result. If stable order is desired, use `sample_indices`, sort the
-    /// result, then apply to the slice.
-    /// 
+    /// If `shuffled == true` then the sampled values will be fully shuffled;
+    /// otherwise the values may only partially shuffled, depending on the
+    /// algorithm used (i.e. biases may exist in the ordering of sampled
+    /// elements). Depending on the algorithm used internally, full shuffling
+    /// may add significant overhead for `amount` > 10 or so, but not more
+    /// than double the time and often much less.
+    ///
     /// Complexity is expected to be the same as `sample_indices`.
     /// 
     /// # Example
@@ -82,16 +83,16 @@ pub trait SliceRandom {
     /// let sample = "Hello, audience!".as_bytes();
     /// 
     /// // collect the results into a vector:
-    /// let v: Vec<u8> = sample.choose_multiple(&mut rng, 3).cloned().collect();
+    /// let v: Vec<u8> = sample.choose_multiple(&mut rng, 3, true).cloned().collect();
     /// 
     /// // store in a buffer:
     /// let mut buf = [0u8; 5];
-    /// for (b, slot) in sample.choose_multiple(&mut rng, buf.len()).zip(buf.iter_mut()) {
+    /// for (b, slot) in sample.choose_multiple(&mut rng, buf.len(), true).zip(buf.iter_mut()) {
     ///     *slot = *b;
     /// }
     /// ```
     #[cfg(feature = "alloc")]
-    fn choose_multiple<R>(&self, rng: &mut R, amount: usize) -> SliceChooseIter<Self, Self::Item>
+    fn choose_multiple<R>(&self, rng: &mut R, amount: usize, shuffled: bool) -> SliceChooseIter<Self, Self::Item>
         where R: Rng + ?Sized;
 
     /// Similar to [`choose`], where the likelihood of each outcome may be
@@ -317,14 +318,15 @@ impl<T> SliceRandom for [T] {
     }
 
     #[cfg(feature = "alloc")]
-    fn choose_multiple<R>(&self, rng: &mut R, amount: usize) -> SliceChooseIter<Self, Self::Item>
+    fn choose_multiple<R>(&self, rng: &mut R, amount: usize, shuffled: bool)
+        -> SliceChooseIter<Self, Self::Item>
         where R: Rng + ?Sized
     {
         let amount = ::core::cmp::min(amount, self.len());
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: sample_indices(rng, self.len(), amount).into_iter(),
+            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter(),
         }
     }
 
@@ -461,7 +463,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     where R: Rng + ?Sized,
           T: Clone
 {
-    let indices = sample_indices(rng, slice.len(), amount);
+    let indices = sample_indices(rng, slice.len(), amount, true);
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.iter().map(|i| slice[*i as usize].clone()));
@@ -484,24 +486,32 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
 pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T>
     where R: Rng + ?Sized
 {
-    let indices = sample_indices(rng, slice.len(), amount);
+    let indices = sample_indices(rng, slice.len(), amount, true);
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.iter().map(|i| &slice[*i as usize]));
     out
 }
 
-/// Randomly sample exactly `amount` indices from `0..length`.
+/// Randomly sample exactly `amount` distinct indices from `0..length`.
 ///
-/// The values are non-repeating and in random order.
+/// If `shuffled == true` then the sampled values will be fully shuffled;
+/// otherwise the values may only partially shuffled, depending on the
+/// algorithm used (i.e. biases may exist in the ordering of sampled elements).
+/// Depending on the algorithm used internally, full shuffling may add
+/// significant overhead for `amount` > 10 or so, but not more than double
+/// the time and often much less.
 ///
 /// This method is used internally by the slice sampling methods, but it can
 /// sometimes be useful to have the indices themselves so this is provided as
 /// an alternative.
 ///
 /// The implementation used is not specified; we automatically select the
-/// fastest available implementation. Roughly speaking, complexity is
-/// `O(amount)` if `amount` is small relative to `length`, otherwise `O(length)`.
+/// fastest available implementation for the `length` and `amount` parameters
+/// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
+/// complexity is `O(amount)`, except that when `amount` is small, performance
+/// is closer to `O(amount^2)`, and when `length` is close to `amount` then
+/// `O(length)`.
 ///
 /// Note that we only support `u32` indices since this covers the vast majority
 /// of uses, and performance is significantly better than with `u64`.
@@ -511,7 +521,8 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
 ///
 /// Panics if `amount > length` or if `length` is not reprentable as a `u32`.
 #[cfg(feature = "alloc")]
-pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<u32>
+pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
+    shuffled: bool) -> Vec<u32>
     where R: Rng + ?Sized,
 {
     if amount > length {
@@ -534,7 +545,7 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<u32>
         if C[0][j] * (length as u64) < (C[1][j] + m4) * amount as u64 {
             sample_indices_inplace(rng, length, amount)
         } else {
-            sample_indices_floyd(rng, length, amount)
+            sample_indices_floyd(rng, length, amount, shuffled)
         }
     } else {
         const C: [[u64; 2]; 2] = [[1, 36], [62*40, 68*40]];
@@ -549,19 +560,30 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize) -> Vec<u32>
 
 /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
 /// combination algorithm.
+/// 
+/// If `shuffled == false`, the values are only partially shuffled (i.e. biases
+/// exist in the ordering of sampled elements). If `shuffled == true`, the
+/// values are fully shuffled.
 ///
 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32)
-    -> Vec<u32>
+fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> Vec<u32>
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
     let mut indices = Vec::with_capacity(amount as usize);
     for j in length - amount .. length {
         let t = rng.gen_range(0, j + 1);
-        let t = if indices.contains(&t) { j } else { t };
-        indices.push( t );
+        if indices.contains(&t) {
+            indices.push(j)
+        } else {
+            indices.push(t)
+        };
+    }
+    if shuffled {
+        // Note that there is a variant of Floyd's algorithm with native full
+        // shuffling, but it is slow because it requires arbitrary insertions.
+        indices.shuffle(rng);
     }
     indices
 }
@@ -800,16 +822,16 @@ mod test {
         assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0; 0]);
         assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
 
-        assert_eq!(&sample_indices_floyd(&mut r, 0, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_floyd(&mut r, 1, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_floyd(&mut r, 1, 1)[..], [0]);
+        assert_eq!(&sample_indices_floyd(&mut r, 0, 0, false)[..], [0; 0]);
+        assert_eq!(&sample_indices_floyd(&mut r, 1, 0, false)[..], [0; 0]);
+        assert_eq!(&sample_indices_floyd(&mut r, 1, 1, false)[..], [0]);
         
         // These algorithms should be fast with big numbers. Test average.
         let sum = sample_indices_cache(&mut r, 1 << 25, 10)
             .iter().fold(0, |a, b| a + b);
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
         
-        let sum = sample_indices_floyd(&mut r, 1 << 25, 10)
+        let sum = sample_indices_floyd(&mut r, 1 << 25, 10, false)
             .iter().fold(0, |a, b| a + b);
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 
@@ -846,7 +868,7 @@ mod test {
 
             // assert the basics work
             let regular = sample_indices(
-                &mut xor_rng(seed), length, amount);
+                &mut xor_rng(seed), length, amount, true);
             assert_eq!(regular.len(), amount);
             assert!(regular.iter().all(|e| *e < length as u32));
 
@@ -875,13 +897,13 @@ mod test {
         // A small length and relatively large amount should use inplace
         r.fill(&mut seed);
         let (length, amount): (u32, u32) = (100, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
         let v2 = sample_indices_inplace(&mut xor_rng(seed), length, amount);
         assert!(v1.iter().all(|e| *e < length));
         assert_eq!(v1, v2);
         
         // Test Floyd's alg does produce different results
-        let v3 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
+        let v3 = sample_indices_floyd(&mut xor_rng(seed), length, amount, true);
         assert!(v1 != v3);
         // However, the cache alg should produce the same results
         let v4 = sample_indices_cache(&mut xor_rng(seed), length, amount);
@@ -890,15 +912,15 @@ mod test {
         // A large length and small amount should use Floyd
         r.fill(&mut seed);
         let (length, amount): (u32, u32) = (1<<20, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
-        let v2 = sample_indices_floyd(&mut xor_rng(seed), length, amount);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
+        let v2 = sample_indices_floyd(&mut xor_rng(seed), length, amount, true);
         assert!(v1.iter().all(|e| *e < length));
         assert_eq!(v1, v2);
         
         // A large length and larger amount should use cache
         r.fill(&mut seed);
         let (length, amount): (u32, u32) = (1<<20, 600);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize);
+        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
         let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
         assert!(v1.iter().all(|e| *e < length as u32));
         assert_eq!(v1, v2);

From 98889f21bf3cc30b1e92f9d13cdd772356423d76 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Sat, 2 Jun 2018 15:22:47 +0100
Subject: [PATCH 06/14] sample_indices: update selection heuristics

Update with new benchmark data from `u32` impls of Floyd's and cached
algorithms (inplace alg already used benchmarks from `u32` impl).
Update Floyd's with a balanced model adequate for both shuffled
and unshuffled versions.
---
 src/seq.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 72f7e58953d..62b6824f548 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -538,17 +538,17 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
     // https://github.com/rust-lang-nursery/rand/pull/479
     // We do some calculations with u64 to avoid overflow.
 
-    if amount < 517 {
-        const C: [[u64; 2]; 2] = [[1, 36], [200, 440]];
+    if amount < 442 {
+        const C: [[u64; 2]; 2] = [[5, 45], [50, 350]];
         let j = if length < 500_000 { 0 } else { 1 };
-        let m4 = 4 * amount as u64;
+        let m4 = 6 * amount as u64;
         if C[0][j] * (length as u64) < (C[1][j] + m4) * amount as u64 {
             sample_indices_inplace(rng, length, amount)
         } else {
             sample_indices_floyd(rng, length, amount, shuffled)
         }
     } else {
-        const C: [[u64; 2]; 2] = [[1, 36], [62*40, 68*40]];
+        const C: [[u64; 2]; 2] = [[1, 9], [590, 600]];
         let j = if length < 500_000 { 0 } else { 1 };
         if C[0][j] * (length as u64) < C[1][j] * amount as u64 {
             sample_indices_inplace(rng, length, amount)

From c95f8969d3525a0acb82929cda26ba162439fc80 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Wed, 13 Jun 2018 12:22:17 +0100
Subject: [PATCH 07/14] sample_indices: use f32 in heuristics and add short-cut

Motivation: don't have to worry about overflow whichever index type is used.
Appears to slightly improve some benchmarks, no effect on others.
---
 src/seq.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 62b6824f548..71867ee6b22 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -536,21 +536,23 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
     
     // Choice of algorithm here depends on both length and amount. See:
     // https://github.com/rust-lang-nursery/rand/pull/479
-    // We do some calculations with u64 to avoid overflow.
+    // We do some calculations with f32. Accuracy is not very important.
 
     if amount < 442 {
-        const C: [[u64; 2]; 2] = [[5, 45], [50, 350]];
+        const C: [[f32; 2]; 2] = [[1.2, 6.0/45.0], [10.0, 70.0/9.0]];
         let j = if length < 500_000 { 0 } else { 1 };
-        let m4 = 6 * amount as u64;
-        if C[0][j] * (length as u64) < (C[1][j] + m4) * amount as u64 {
+        let amount_fp = amount as f32;
+        let m4 = C[0][j] * amount_fp;
+        // Short-cut: when amount < 12, floyd's is always faster
+        if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
             sample_indices_inplace(rng, length, amount)
         } else {
             sample_indices_floyd(rng, length, amount, shuffled)
         }
     } else {
-        const C: [[u64; 2]; 2] = [[1, 9], [590, 600]];
+        const C: [f32; 2] = [590.0, 600.0/9.0];
         let j = if length < 500_000 { 0 } else { 1 };
-        if C[0][j] * (length as u64) < C[1][j] * amount as u64 {
+        if (length as f32) < C[j] * (amount as f32) {
             sample_indices_inplace(rng, length, amount)
         } else {
             sample_indices_cache(rng, length, amount)

From 59d0823a78ec431c05be98fbbf5688861ad45580 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Thu, 28 Jun 2018 14:55:05 +0100
Subject: [PATCH 08/14] sample_indices: abstract over return type

This is to allow use of u32 or usize internally
---
 src/lib.rs |   2 +-
 src/seq.rs | 206 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 160 insertions(+), 48 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index f07a68c495a..197fc2546fc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -234,7 +234,7 @@
 #![cfg_attr(feature = "wasm-bindgen", feature(wasm_import_module))]
 
 #[cfg(feature = "std")] extern crate core;
-#[cfg(all(feature = "alloc", not(feature="std")))] extern crate alloc;
+#[cfg(all(feature = "alloc", not(feature="std")))] #[macro_use] extern crate alloc;
 
 #[cfg(feature="simd_support")] extern crate packed_simd;
 
diff --git a/src/seq.rs b/src/seq.rs
index 71867ee6b22..0e598f2a8dc 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -13,6 +13,7 @@
 //! TODO: module doc
 
 #[cfg(feature="alloc")] use core::ops::Index;
+#[cfg(feature="alloc")] use core::slice;
 
 #[cfg(feature="std")] use std::vec;
 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec;
@@ -326,7 +327,7 @@ impl<T> SliceRandom for [T] {
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter(),
+            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter_usize(),
         }
     }
 
@@ -398,7 +399,7 @@ impl<I> IteratorRandom for I where I: Iterator + Sized {}
 pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> {
     slice: &'a S,
     _phantom: ::core::marker::PhantomData<T>,
-    indices: vec::IntoIter<u32>,
+    indices: IndicesIntoIter,
 }
 
 #[cfg(feature = "alloc")]
@@ -466,7 +467,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     let indices = sample_indices(rng, slice.len(), amount, true);
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| slice[*i as usize].clone()));
+    out.extend(indices.iter_usize().map(|i| slice[i].clone()));
     out
 }
 
@@ -489,10 +490,126 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
     let indices = sample_indices(rng, slice.len(), amount, true);
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter().map(|i| &slice[*i as usize]));
+    out.extend(indices.iter_usize().map(|i| &slice[i]));
     out
 }
 
+/// Return type of `sample_indices`.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug, PartialEq)]
+pub enum Indices {
+    /// Representation: a vector over `u32` values
+    U32(Vec<u32>)
+}
+
+#[cfg(feature = "alloc")]
+impl Indices {
+    /// Returns the number of indices
+    pub fn len(&self) -> usize {
+        match self {
+            &Indices::U32(ref v) => v.len(),
+        }
+    }
+
+    /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
+    pub fn into_vec_usize(self) -> Vec<usize> {
+        match self {
+            Indices::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+        }
+    }
+
+    /// Iterate over the indices as a sequence of `usize` values
+    pub fn iter_usize<'a>(&'a self) -> IndicesIter<'a> {
+        match self {
+            &Indices::U32(ref v) => IndicesIter::U32(v.iter()),
+        }
+    }
+    
+    /// Convert into an iterator over the indices as a sequence of `usize` values
+    pub fn into_iter_usize(self) -> IndicesIntoIter {
+        match self {
+            Indices::U32(v) => IndicesIntoIter::U32(v.into_iter()),
+        }
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl From<Vec<u32>> for Indices {
+    fn from(v: Vec<u32>) -> Self {
+        Indices::U32(v)
+    }
+}
+
+/// Return type of `Indices::iter_usize`.
+#[cfg(feature = "alloc")]
+#[derive(Debug)]
+pub enum IndicesIter<'a> {
+    #[doc(hidden)] U32(slice::Iter<'a, u32>),
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Iterator for IndicesIter<'a> {
+    type Item = usize;
+    fn next(&mut self) -> Option<usize> {
+        use self::IndicesIter::*;
+        match self {
+            &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            &IndicesIter::U32(ref v) => v.size_hint(),
+        }
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> ExactSizeIterator for IndicesIter<'a> {
+    fn len(&self) -> usize {
+        match self {
+            &IndicesIter::U32(ref v) => v.len(),
+        }
+    }
+}
+
+/// Return type of `Indices::into_iter_usize`.
+#[cfg(feature = "alloc")]
+#[derive(Clone, Debug)]
+pub enum IndicesIntoIter {
+    #[doc(hidden)] U32(vec::IntoIter<u32>),
+}
+
+#[cfg(feature = "alloc")]
+impl Iterator for IndicesIntoIter {
+    type Item = usize;
+    
+    fn next(&mut self) -> Option<Self::Item> {
+        use self::IndicesIntoIter::*;
+        match self {
+            &mut U32(ref mut v) => v.next().map(|i| i as usize),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        use self::IndicesIntoIter::*;
+        match self {
+            &U32(ref v) => v.size_hint(),
+        }
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl ExactSizeIterator for IndicesIntoIter {
+    fn len(&self) -> usize {
+        use self::IndicesIntoIter::*;
+        match self {
+            &U32(ref v) => v.len(),
+        }
+    }
+}
+
+
 /// Randomly sample exactly `amount` distinct indices from `0..length`.
 ///
 /// If `shuffled == true` then the sampled values will be fully shuffled;
@@ -522,7 +639,7 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
 /// Panics if `amount > length` or if `length` is not reprentable as a `u32`.
 #[cfg(feature = "alloc")]
 pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
-    shuffled: bool) -> Vec<u32>
+    shuffled: bool) -> Indices
     where R: Rng + ?Sized,
 {
     if amount > length {
@@ -569,7 +686,7 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
 ///
 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> Vec<u32>
+fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> Indices
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
@@ -587,7 +704,7 @@ fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool
         // shuffling, but it is slow because it requires arbitrary insertions.
         indices.shuffle(rng);
     }
-    indices
+    Indices::from(indices)
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using an inplace
@@ -605,8 +722,7 @@ fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool
 /// allocations. Set-up is `O(length)` time and memory and shuffling is
 /// `O(amount)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32)
-    -> Vec<u32>
+fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
@@ -618,7 +734,7 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32)
     }
     indices.truncate(amount as usize);
     debug_assert_eq!(indices.len(), amount as usize);
-    indices
+    Indices::from(indices)
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using a
@@ -629,8 +745,7 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32)
 /// values from `1_000_000`. The algorithm is `O(amount)` time and memory,
 /// but due to overheads will often be slower than other approaches.
 #[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32)
-    -> Vec<u32>
+fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32) -> Indices
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
@@ -655,7 +770,7 @@ fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32)
         indices.push(x_j);  // push at position i
     }
     debug_assert_eq!(indices.len(), amount as usize);
-    indices
+    Indices::from(indices)
 }
 
 #[cfg(test)]
@@ -816,25 +931,25 @@ mod test {
         let v = sample_slice(&mut r, &[42, 133], 2);
         assert!(&v[..] == [42, 133] || v[..] == [133, 42]);
 
-        assert_eq!(&sample_indices_inplace(&mut r, 0, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_inplace(&mut r, 1, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_inplace(&mut r, 1, 1)[..], [0]);
+        assert_eq!(sample_indices_inplace(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_indices_inplace(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_indices_inplace(&mut r, 1, 1).into_vec_usize(), vec![0]);
 
-        assert_eq!(&sample_indices_cache(&mut r, 0, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_cache(&mut r, 1, 0)[..], [0; 0]);
-        assert_eq!(&sample_indices_cache(&mut r, 1, 1)[..], [0]);
+        assert_eq!(sample_indices_cache(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_indices_cache(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_indices_cache(&mut r, 1, 1).into_vec_usize(), vec![0]);
 
-        assert_eq!(&sample_indices_floyd(&mut r, 0, 0, false)[..], [0; 0]);
-        assert_eq!(&sample_indices_floyd(&mut r, 1, 0, false)[..], [0; 0]);
-        assert_eq!(&sample_indices_floyd(&mut r, 1, 1, false)[..], [0]);
+        assert_eq!(sample_indices_floyd(&mut r, 0, 0, false).len(), 0);
+        assert_eq!(sample_indices_floyd(&mut r, 1, 0, false).len(), 0);
+        assert_eq!(sample_indices_floyd(&mut r, 1, 1, false).into_vec_usize(), vec![0]);
         
         // These algorithms should be fast with big numbers. Test average.
-        let sum = sample_indices_cache(&mut r, 1 << 25, 10)
-            .iter().fold(0, |a, b| a + b);
+        let indices = sample_indices_cache(&mut r, 1 << 25, 10);
+        let sum: usize = indices.iter_usize().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
         
-        let sum = sample_indices_floyd(&mut r, 1 << 25, 10, false)
-            .iter().fold(0, |a, b| a + b);
+        let indices = sample_indices_floyd(&mut r, 1 << 25, 10, false);
+        let sum: usize = indices.iter_usize().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 
         // Make sure lucky 777's aren't lucky
@@ -872,16 +987,15 @@ mod test {
             let regular = sample_indices(
                 &mut xor_rng(seed), length, amount, true);
             assert_eq!(regular.len(), amount);
-            assert!(regular.iter().all(|e| *e < length as u32));
+            assert!(regular.iter_usize().all(|e| e < length));
 
             // also test that sampling the slice works
             let vec: Vec<u32> = (0..(length as u32)).collect();
             let result = sample_slice(&mut xor_rng(seed), &vec, amount);
-            assert_eq!(result, regular);
+            assert_eq!(result, regular.iter_usize().map(|i| i as u32).collect::<Vec<_>>());
 
             let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
-            let expected = regular.iter().map(|v| v).collect::<Vec<_>>();
-            assert_eq!(result, expected);
+            assert!(result.iter().zip(regular.iter_usize()).all(|(i,j)| **i == j as u32));
         }
     }
     
@@ -894,37 +1008,35 @@ mod test {
         let mut seed = [0u8; 16];
         
         // We can't test which algorithm is used directly, but Floyd's alg
-        // should produce different results from the others.
+        // should produce different results from the others. (Also, `inplace`
+        // and `cached` currently use different sizes thus produce different results.)
         
         // A small length and relatively large amount should use inplace
         r.fill(&mut seed);
-        let (length, amount): (u32, u32) = (100, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
-        let v2 = sample_indices_inplace(&mut xor_rng(seed), length, amount);
-        assert!(v1.iter().all(|e| *e < length));
+        let (length, amount): (usize, usize) = (100, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_indices_inplace(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter_usize().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // Test Floyd's alg does produce different results
-        let v3 = sample_indices_floyd(&mut xor_rng(seed), length, amount, true);
+        let v3 = sample_indices_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
         assert!(v1 != v3);
-        // However, the cache alg should produce the same results
-        let v4 = sample_indices_cache(&mut xor_rng(seed), length, amount);
-        assert_eq!(v1, v4);
         
         // A large length and small amount should use Floyd
         r.fill(&mut seed);
-        let (length, amount): (u32, u32) = (1<<20, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
-        let v2 = sample_indices_floyd(&mut xor_rng(seed), length, amount, true);
-        assert!(v1.iter().all(|e| *e < length));
+        let (length, amount): (usize, usize) = (1<<20, 50);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_indices_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
+        assert!(v1.iter_usize().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // A large length and larger amount should use cache
         r.fill(&mut seed);
-        let (length, amount): (u32, u32) = (1<<20, 600);
-        let v1 = sample_indices(&mut xor_rng(seed), length as usize, amount as usize, true);
-        let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
-        assert!(v1.iter().all(|e| *e < length as u32));
+        let (length, amount): (usize, usize) = (1<<20, 600);
+        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_indices_cache(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter_usize().all(|e| e < length));
         assert_eq!(v1, v2);
     }
     

From 5fc1da8d8cb1a21e6c7327073868d659943ff353 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Thu, 28 Jun 2018 15:03:07 +0100
Subject: [PATCH 09/14] sample_indices: also support usize internally

---
 src/seq.rs | 65 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 13 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 0e598f2a8dc..78849b3b460 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -496,10 +496,12 @@ pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) ->
 
 /// Return type of `sample_indices`.
 #[cfg(feature = "alloc")]
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug)]
 pub enum Indices {
     /// Representation: a vector over `u32` values
-    U32(Vec<u32>)
+    U32(Vec<u32>),
+    /// Representation: a vector over `usize` values
+    USize(Vec<usize>),
 }
 
 #[cfg(feature = "alloc")]
@@ -508,6 +510,7 @@ impl Indices {
     pub fn len(&self) -> usize {
         match self {
             &Indices::U32(ref v) => v.len(),
+            &Indices::USize(ref v) => v.len(),
         }
     }
 
@@ -515,6 +518,7 @@ impl Indices {
     pub fn into_vec_usize(self) -> Vec<usize> {
         match self {
             Indices::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+            Indices::USize(v) => v,
         }
     }
 
@@ -522,6 +526,7 @@ impl Indices {
     pub fn iter_usize<'a>(&'a self) -> IndicesIter<'a> {
         match self {
             &Indices::U32(ref v) => IndicesIter::U32(v.iter()),
+            &Indices::USize(ref v) => IndicesIter::USize(v.iter()),
         }
     }
     
@@ -529,6 +534,20 @@ impl Indices {
     pub fn into_iter_usize(self) -> IndicesIntoIter {
         match self {
             Indices::U32(v) => IndicesIntoIter::U32(v.into_iter()),
+            Indices::USize(v) => IndicesIntoIter::USize(v.into_iter()),
+        }
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl PartialEq for Indices {
+    fn eq(&self, other: &Indices) -> bool {
+        use self::Indices::*;
+        match (self, other) {
+            (&U32(ref v1), &U32(ref v2)) => v1 == v2,
+            (&USize(ref v1), &USize(ref v2)) => v1 == v2,
+            (a @ _, b @ _) => (a.len() == b.len()) &&
+                    (a.iter_usize().zip(b.iter_usize()).all(|(x, y)| x == y)),
         }
     }
 }
@@ -540,11 +559,19 @@ impl From<Vec<u32>> for Indices {
     }
 }
 
+#[cfg(feature = "alloc")]
+impl From<Vec<usize>> for Indices {
+    fn from(v: Vec<usize>) -> Self {
+        Indices::USize(v)
+    }
+}
+
 /// Return type of `Indices::iter_usize`.
 #[cfg(feature = "alloc")]
 #[derive(Debug)]
 pub enum IndicesIter<'a> {
     #[doc(hidden)] U32(slice::Iter<'a, u32>),
+    #[doc(hidden)] USize(slice::Iter<'a, usize>),
 }
 
 #[cfg(feature = "alloc")]
@@ -554,12 +581,14 @@ impl<'a> Iterator for IndicesIter<'a> {
         use self::IndicesIter::*;
         match self {
             &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
+            &mut USize(ref mut iter) => iter.next().cloned(),
         }
     }
     
     fn size_hint(&self) -> (usize, Option<usize>) {
         match self {
             &IndicesIter::U32(ref v) => v.size_hint(),
+            &IndicesIter::USize(ref v) => v.size_hint(),
         }
     }
 }
@@ -569,6 +598,7 @@ impl<'a> ExactSizeIterator for IndicesIter<'a> {
     fn len(&self) -> usize {
         match self {
             &IndicesIter::U32(ref v) => v.len(),
+            &IndicesIter::USize(ref v) => v.len(),
         }
     }
 }
@@ -578,6 +608,7 @@ impl<'a> ExactSizeIterator for IndicesIter<'a> {
 #[derive(Clone, Debug)]
 pub enum IndicesIntoIter {
     #[doc(hidden)] U32(vec::IntoIter<u32>),
+    #[doc(hidden)] USize(vec::IntoIter<usize>),
 }
 
 #[cfg(feature = "alloc")]
@@ -588,6 +619,7 @@ impl Iterator for IndicesIntoIter {
         use self::IndicesIntoIter::*;
         match self {
             &mut U32(ref mut v) => v.next().map(|i| i as usize),
+            &mut USize(ref mut v) => v.next(),
         }
     }
     
@@ -595,6 +627,7 @@ impl Iterator for IndicesIntoIter {
         use self::IndicesIntoIter::*;
         match self {
             &U32(ref v) => v.size_hint(),
+            &USize(ref v) => v.size_hint(),
         }
     }
 }
@@ -605,6 +638,7 @@ impl ExactSizeIterator for IndicesIntoIter {
         use self::IndicesIntoIter::*;
         match self {
             &U32(ref v) => v.len(),
+            &USize(ref v) => v.len(),
         }
     }
 }
@@ -630,13 +664,14 @@ impl ExactSizeIterator for IndicesIntoIter {
 /// is closer to `O(amount^2)`, and when `length` is close to `amount` then
 /// `O(length)`.
 ///
-/// Note that we only support `u32` indices since this covers the vast majority
-/// of uses, and performance is significantly better than with `u64`.
+/// Note that performance is significantly better over `u32` indices than over
+/// `u64` indices. Because of this we hide the underlying type behind an
+/// abstraction, `Indices`.
 /// 
 /// If an allocation-free `no_std` function is required, it is suggested
 /// to adapt the internal `sample_indices_floyd` implementation.
 ///
-/// Panics if `amount > length` or if `length` is not reprentable as a `u32`.
+/// Panics if `amount > length`.
 #[cfg(feature = "alloc")]
 pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
     shuffled: bool) -> Indices
@@ -646,7 +681,9 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
         panic!("`amount` of samples must be less than or equal to `length`");
     }
     if length > (::core::u32::MAX as usize) {
-        panic!("`length` is not representable as `u32`");
+        // We never want to use inplace here, but could use floyd's alg
+        // Lazy version: always use the cache alg.
+        return sample_indices_cache(rng, length, amount);
     }
     let amount = amount as u32;
     let length = length as u32;
@@ -672,7 +709,9 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
         if (length as f32) < C[j] * (amount as f32) {
             sample_indices_inplace(rng, length, amount)
         } else {
-            sample_indices_cache(rng, length, amount)
+            // note: could have a specific u32 impl, but I'm lazy and
+            // generics don't have usable conversions
+            sample_indices_cache(rng, length as usize, amount as usize)
         }
     }
 }
@@ -745,15 +784,15 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
 /// values from `1_000_000`. The algorithm is `O(amount)` time and memory,
 /// but due to overheads will often be slower than other approaches.
 #[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32) -> Indices
+fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> Indices
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
-    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount as usize);
+    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
     #[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
-    let mut indices = Vec::with_capacity(amount as usize);
+    let mut indices = Vec::with_capacity(amount);
     for i in 0..amount {
-        let j: u32 = rng.gen_range(i, length);
+        let j: usize = rng.gen_range(i, length);
 
         // get the current values at i and j ...
         let x_i = match cache.get(&i) {
@@ -769,7 +808,7 @@ fn sample_indices_cache<R>(rng: &mut R, length: u32, amount: u32) -> Indices
         cache.insert(j, x_i);
         indices.push(x_j);  // push at position i
     }
-    debug_assert_eq!(indices.len(), amount as usize);
+    debug_assert_eq!(indices.len(), amount);
     Indices::from(indices)
 }
 
@@ -1035,7 +1074,7 @@ mod test {
         r.fill(&mut seed);
         let (length, amount): (usize, usize) = (1<<20, 600);
         let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
-        let v2 = sample_indices_cache(&mut xor_rng(seed), length as u32, amount as u32);
+        let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
         assert!(v1.iter_usize().all(|e| e < length));
         assert_eq!(v1, v2);
     }

From d624e840ba24ec5620cf57ccb2376e110923b538 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Mon, 2 Jul 2018 13:05:45 +0100
Subject: [PATCH 10/14] sample_indices_cache: use rejection sampling instead

---
 src/seq.rs | 57 ++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index 78849b3b460..e9648181564 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -19,12 +19,12 @@
 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec;
 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::Vec;
 // BTreeMap is not as fast in tests, but better than nothing.
-#[cfg(feature="std")] use std::collections::HashMap;
-#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeMap;
+#[cfg(feature="std")] use std::collections::{HashSet};
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
 
-#[cfg(feature = "alloc")] use distributions::WeightedError;
-
-use super::Rng;
+use Rng;
+#[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
+#[cfg(feature="alloc")] use distributions::WeightedError;
 #[cfg(feature="alloc")] use distributions::uniform::{SampleUniform, SampleBorrow};
 
 /// Extension trait on slices, providing random mutation and sampling methods.
@@ -408,7 +408,7 @@ impl<'a, S: Index<usize, Output = T> + ?Sized + 'a, T: 'a> Iterator for SliceCho
 
     fn next(&mut self) -> Option<Self::Item> {
         // TODO: investigate using SliceIndex::get_unchecked when stable
-        self.indices.next().map(|i| &(*self.slice)[i as usize])
+        self.indices.next().map(|i| &self.slice[i as usize])
     }
     
     fn size_hint(&self) -> (usize, Option<usize>) {
@@ -776,38 +776,29 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
     Indices::from(indices)
 }
 
-/// Randomly sample exactly `amount` indices from `0..length`, using a
-/// dynamically-cached partial Fisher-Yates method.
-///
-/// The cache avoids allocating the entire `length` of values. This is
-/// especially useful when `amount <<< length`; e.g. selecting 3 non-repeating
-/// values from `1_000_000`. The algorithm is `O(amount)` time and memory,
-/// but due to overheads will often be slower than other approaches.
+/// Randomly sample exactly `amount` indices from `0..length`, using rejection
+/// sampling.
+/// 
+/// Since `amount <<< length` there is a low chance of a random sample in
+/// `0..length` being a duplicate. We test for duplicates and resample where
+/// necessary. The algorithm is `O(amount)` time and memory.
 #[cfg(feature = "alloc")]
 fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> Indices
     where R: Rng + ?Sized,
 {
-    debug_assert!(amount <= length);
-    #[cfg(feature="std")] let mut cache = HashMap::with_capacity(amount);
-    #[cfg(not(feature="std"))] let mut cache = BTreeMap::new();
+    debug_assert!(amount < length);
+    #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
+    #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
+    let distr = Uniform::new(0, length);
     let mut indices = Vec::with_capacity(amount);
-    for i in 0..amount {
-        let j: usize = rng.gen_range(i, length);
-
-        // get the current values at i and j ...
-        let x_i = match cache.get(&i) {
-            Some(x) => *x,
-            None => i,
-        };
-        let x_j = match cache.get(&j) {
-            Some(x) => *x,
-            None => j,
-        };
-
-        // ... and swap them
-        cache.insert(j, x_i);
-        indices.push(x_j);  // push at position i
+    for _ in 0..amount {
+        let mut pos = distr.sample(rng);
+        while !cache.insert(pos) {
+            pos = distr.sample(rng);
+        }
+        indices.push(pos);
     }
+    
     debug_assert_eq!(indices.len(), amount);
     Indices::from(indices)
 }
@@ -974,9 +965,7 @@ mod test {
         assert_eq!(sample_indices_inplace(&mut r, 1, 0).len(), 0);
         assert_eq!(sample_indices_inplace(&mut r, 1, 1).into_vec_usize(), vec![0]);
 
-        assert_eq!(sample_indices_cache(&mut r, 0, 0).len(), 0);
         assert_eq!(sample_indices_cache(&mut r, 1, 0).len(), 0);
-        assert_eq!(sample_indices_cache(&mut r, 1, 1).into_vec_usize(), vec![0]);
 
         assert_eq!(sample_indices_floyd(&mut r, 0, 0, false).len(), 0);
         assert_eq!(sample_indices_floyd(&mut r, 1, 0, false).len(), 0);

From 91f0af25045fb0ca3b9bb003a91629d2c2643dcc Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Wed, 4 Jul 2018 17:50:30 +0100
Subject: [PATCH 11/14] =?UTF-8?q?sample=5Findices:=20rename=20Indices=20?=
 =?UTF-8?q?=E2=86=92=20IndexVec;=20some=20revisions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/seq.rs | 160 ++++++++++++++++++++++++++---------------------------
 1 file changed, 79 insertions(+), 81 deletions(-)

diff --git a/src/seq.rs b/src/seq.rs
index e9648181564..3c48a2135d9 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -327,7 +327,7 @@ impl<T> SliceRandom for [T] {
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter_usize(),
+            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter(),
         }
     }
 
@@ -399,7 +399,7 @@ impl<I> IteratorRandom for I where I: Iterator + Sized {}
 pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> {
     slice: &'a S,
     _phantom: ::core::marker::PhantomData<T>,
-    indices: IndicesIntoIter,
+    indices: IndexVecIntoIter,
 }
 
 #[cfg(feature = "alloc")]
@@ -464,10 +464,10 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     where R: Rng + ?Sized,
           T: Clone
 {
-    let indices = sample_indices(rng, slice.len(), amount, true);
+    let indices = sample_indices(rng, slice.len(), amount, true).into_iter();
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter_usize().map(|i| slice[i].clone()));
+    out.extend(indices.map(|i| slice[i].clone()));
     out
 }
 
@@ -487,98 +487,111 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
 pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T>
     where R: Rng + ?Sized
 {
-    let indices = sample_indices(rng, slice.len(), amount, true);
+    let indices = sample_indices(rng, slice.len(), amount, true).into_iter();
 
     let mut out = Vec::with_capacity(amount);
-    out.extend(indices.iter_usize().map(|i| &slice[i]));
+    out.extend(indices.map(|i| &slice[i]));
     out
 }
 
-/// Return type of `sample_indices`.
+/// A vector of indices.
+/// 
+/// Multiple internal representations are possible.
 #[cfg(feature = "alloc")]
 #[derive(Clone, Debug)]
-pub enum Indices {
-    /// Representation: a vector over `u32` values
-    U32(Vec<u32>),
-    /// Representation: a vector over `usize` values
-    USize(Vec<usize>),
+pub enum IndexVec {
+    #[doc(hidden)] U32(Vec<u32>),
+    #[doc(hidden)] USize(Vec<usize>),
 }
 
 #[cfg(feature = "alloc")]
-impl Indices {
+impl IndexVec {
     /// Returns the number of indices
     pub fn len(&self) -> usize {
         match self {
-            &Indices::U32(ref v) => v.len(),
-            &Indices::USize(ref v) => v.len(),
+            &IndexVec::U32(ref v) => v.len(),
+            &IndexVec::USize(ref v) => v.len(),
+        }
+    }
+    
+    /// Return the value at the given `index`.
+    /// 
+    /// (Note: we cannot implement `std::ops::Index` because of lifetime
+    /// restrictions.)
+    pub fn index(&self, index: usize) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v[index] as usize,
+            &IndexVec::USize(ref v) => v[index],
         }
     }
 
     /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
-    pub fn into_vec_usize(self) -> Vec<usize> {
+    pub fn into_vec(self) -> Vec<usize> {
         match self {
-            Indices::U32(v) => v.into_iter().map(|i| i as usize).collect(),
-            Indices::USize(v) => v,
+            IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+            IndexVec::USize(v) => v,
         }
     }
 
     /// Iterate over the indices as a sequence of `usize` values
-    pub fn iter_usize<'a>(&'a self) -> IndicesIter<'a> {
+    pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
         match self {
-            &Indices::U32(ref v) => IndicesIter::U32(v.iter()),
-            &Indices::USize(ref v) => IndicesIter::USize(v.iter()),
+            &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
+            &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
         }
     }
     
     /// Convert into an iterator over the indices as a sequence of `usize` values
-    pub fn into_iter_usize(self) -> IndicesIntoIter {
+    pub fn into_iter(self) -> IndexVecIntoIter {
         match self {
-            Indices::U32(v) => IndicesIntoIter::U32(v.into_iter()),
-            Indices::USize(v) => IndicesIntoIter::USize(v.into_iter()),
+            IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
+            IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
         }
     }
 }
 
 #[cfg(feature = "alloc")]
-impl PartialEq for Indices {
-    fn eq(&self, other: &Indices) -> bool {
-        use self::Indices::*;
+impl PartialEq for IndexVec {
+    fn eq(&self, other: &IndexVec) -> bool {
+        use self::IndexVec::*;
         match (self, other) {
             (&U32(ref v1), &U32(ref v2)) => v1 == v2,
             (&USize(ref v1), &USize(ref v2)) => v1 == v2,
-            (a @ _, b @ _) => (a.len() == b.len()) &&
-                    (a.iter_usize().zip(b.iter_usize()).all(|(x, y)| x == y)),
+            (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
+            (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
         }
     }
 }
 
 #[cfg(feature = "alloc")]
-impl From<Vec<u32>> for Indices {
+impl From<Vec<u32>> for IndexVec {
     fn from(v: Vec<u32>) -> Self {
-        Indices::U32(v)
+        IndexVec::U32(v)
     }
 }
 
 #[cfg(feature = "alloc")]
-impl From<Vec<usize>> for Indices {
+impl From<Vec<usize>> for IndexVec {
     fn from(v: Vec<usize>) -> Self {
-        Indices::USize(v)
+        IndexVec::USize(v)
     }
 }
 
-/// Return type of `Indices::iter_usize`.
+/// Return type of `IndexVec::iter`.
 #[cfg(feature = "alloc")]
 #[derive(Debug)]
-pub enum IndicesIter<'a> {
+pub enum IndexVecIter<'a> {
     #[doc(hidden)] U32(slice::Iter<'a, u32>),
     #[doc(hidden)] USize(slice::Iter<'a, usize>),
 }
 
 #[cfg(feature = "alloc")]
-impl<'a> Iterator for IndicesIter<'a> {
+impl<'a> Iterator for IndexVecIter<'a> {
     type Item = usize;
     fn next(&mut self) -> Option<usize> {
-        use self::IndicesIter::*;
+        use self::IndexVecIter::*;
         match self {
             &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
             &mut USize(ref mut iter) => iter.next().cloned(),
@@ -587,36 +600,29 @@ impl<'a> Iterator for IndicesIter<'a> {
     
     fn size_hint(&self) -> (usize, Option<usize>) {
         match self {
-            &IndicesIter::U32(ref v) => v.size_hint(),
-            &IndicesIter::USize(ref v) => v.size_hint(),
+            &IndexVecIter::U32(ref v) => v.size_hint(),
+            &IndexVecIter::USize(ref v) => v.size_hint(),
         }
     }
 }
 
 #[cfg(feature = "alloc")]
-impl<'a> ExactSizeIterator for IndicesIter<'a> {
-    fn len(&self) -> usize {
-        match self {
-            &IndicesIter::U32(ref v) => v.len(),
-            &IndicesIter::USize(ref v) => v.len(),
-        }
-    }
-}
+impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
 
-/// Return type of `Indices::into_iter_usize`.
+/// Return type of `IndexVec::into_iter`.
 #[cfg(feature = "alloc")]
 #[derive(Clone, Debug)]
-pub enum IndicesIntoIter {
+pub enum IndexVecIntoIter {
     #[doc(hidden)] U32(vec::IntoIter<u32>),
     #[doc(hidden)] USize(vec::IntoIter<usize>),
 }
 
 #[cfg(feature = "alloc")]
-impl Iterator for IndicesIntoIter {
+impl Iterator for IndexVecIntoIter {
     type Item = usize;
     
     fn next(&mut self) -> Option<Self::Item> {
-        use self::IndicesIntoIter::*;
+        use self::IndexVecIntoIter::*;
         match self {
             &mut U32(ref mut v) => v.next().map(|i| i as usize),
             &mut USize(ref mut v) => v.next(),
@@ -624,7 +630,7 @@ impl Iterator for IndicesIntoIter {
     }
     
     fn size_hint(&self) -> (usize, Option<usize>) {
-        use self::IndicesIntoIter::*;
+        use self::IndexVecIntoIter::*;
         match self {
             &U32(ref v) => v.size_hint(),
             &USize(ref v) => v.size_hint(),
@@ -633,15 +639,7 @@ impl Iterator for IndicesIntoIter {
 }
 
 #[cfg(feature = "alloc")]
-impl ExactSizeIterator for IndicesIntoIter {
-    fn len(&self) -> usize {
-        use self::IndicesIntoIter::*;
-        match self {
-            &U32(ref v) => v.len(),
-            &USize(ref v) => v.len(),
-        }
-    }
-}
+impl ExactSizeIterator for IndexVecIntoIter {}
 
 
 /// Randomly sample exactly `amount` distinct indices from `0..length`.
@@ -666,7 +664,7 @@ impl ExactSizeIterator for IndicesIntoIter {
 ///
 /// Note that performance is significantly better over `u32` indices than over
 /// `u64` indices. Because of this we hide the underlying type behind an
-/// abstraction, `Indices`.
+/// abstraction, `IndexVec`.
 /// 
 /// If an allocation-free `no_std` function is required, it is suggested
 /// to adapt the internal `sample_indices_floyd` implementation.
@@ -674,7 +672,7 @@ impl ExactSizeIterator for IndicesIntoIter {
 /// Panics if `amount > length`.
 #[cfg(feature = "alloc")]
 pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
-    shuffled: bool) -> Indices
+    shuffled: bool) -> IndexVec
     where R: Rng + ?Sized,
 {
     if amount > length {
@@ -725,7 +723,7 @@ pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
 ///
 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> Indices
+fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> IndexVec
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
@@ -743,7 +741,7 @@ fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool
         // shuffling, but it is slow because it requires arbitrary insertions.
         indices.shuffle(rng);
     }
-    Indices::from(indices)
+    IndexVec::from(indices)
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using an inplace
@@ -761,7 +759,7 @@ fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool
 /// allocations. Set-up is `O(length)` time and memory and shuffling is
 /// `O(amount)` time.
 #[cfg(feature = "alloc")]
-fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
+fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
     where R: Rng + ?Sized,
 {
     debug_assert!(amount <= length);
@@ -773,7 +771,7 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
     }
     indices.truncate(amount as usize);
     debug_assert_eq!(indices.len(), amount as usize);
-    Indices::from(indices)
+    IndexVec::from(indices)
 }
 
 /// Randomly sample exactly `amount` indices from `0..length`, using rejection
@@ -783,7 +781,7 @@ fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> Indices
 /// `0..length` being a duplicate. We test for duplicates and resample where
 /// necessary. The algorithm is `O(amount)` time and memory.
 #[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> Indices
+fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
     where R: Rng + ?Sized,
 {
     debug_assert!(amount < length);
@@ -800,7 +798,7 @@ fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> Indices
     }
     
     debug_assert_eq!(indices.len(), amount);
-    Indices::from(indices)
+    IndexVec::from(indices)
 }
 
 #[cfg(test)]
@@ -963,21 +961,21 @@ mod test {
 
         assert_eq!(sample_indices_inplace(&mut r, 0, 0).len(), 0);
         assert_eq!(sample_indices_inplace(&mut r, 1, 0).len(), 0);
-        assert_eq!(sample_indices_inplace(&mut r, 1, 1).into_vec_usize(), vec![0]);
+        assert_eq!(sample_indices_inplace(&mut r, 1, 1).into_vec(), vec![0]);
 
         assert_eq!(sample_indices_cache(&mut r, 1, 0).len(), 0);
 
         assert_eq!(sample_indices_floyd(&mut r, 0, 0, false).len(), 0);
         assert_eq!(sample_indices_floyd(&mut r, 1, 0, false).len(), 0);
-        assert_eq!(sample_indices_floyd(&mut r, 1, 1, false).into_vec_usize(), vec![0]);
+        assert_eq!(sample_indices_floyd(&mut r, 1, 1, false).into_vec(), vec![0]);
         
         // These algorithms should be fast with big numbers. Test average.
-        let indices = sample_indices_cache(&mut r, 1 << 25, 10);
-        let sum: usize = indices.iter_usize().sum();
+        let sum: usize = sample_indices_cache(&mut r, 1 << 25, 10)
+                .into_iter().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
         
-        let indices = sample_indices_floyd(&mut r, 1 << 25, 10, false);
-        let sum: usize = indices.iter_usize().sum();
+        let sum: usize = sample_indices_floyd(&mut r, 1 << 25, 10, false)
+                .into_iter().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
 
         // Make sure lucky 777's aren't lucky
@@ -1015,15 +1013,15 @@ mod test {
             let regular = sample_indices(
                 &mut xor_rng(seed), length, amount, true);
             assert_eq!(regular.len(), amount);
-            assert!(regular.iter_usize().all(|e| e < length));
+            assert!(regular.iter().all(|e| e < length));
 
             // also test that sampling the slice works
             let vec: Vec<u32> = (0..(length as u32)).collect();
             let result = sample_slice(&mut xor_rng(seed), &vec, amount);
-            assert_eq!(result, regular.iter_usize().map(|i| i as u32).collect::<Vec<_>>());
+            assert_eq!(result, regular.iter().map(|i| i as u32).collect::<Vec<_>>());
 
             let result = sample_slice_ref(&mut xor_rng(seed), &vec, amount);
-            assert!(result.iter().zip(regular.iter_usize()).all(|(i,j)| **i == j as u32));
+            assert!(result.iter().zip(regular.iter()).all(|(i,j)| **i == j as u32));
         }
     }
     
@@ -1044,7 +1042,7 @@ mod test {
         let (length, amount): (usize, usize) = (100, 50);
         let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
         let v2 = sample_indices_inplace(&mut xor_rng(seed), length as u32, amount as u32);
-        assert!(v1.iter_usize().all(|e| e < length));
+        assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // Test Floyd's alg does produce different results
@@ -1056,7 +1054,7 @@ mod test {
         let (length, amount): (usize, usize) = (1<<20, 50);
         let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
         let v2 = sample_indices_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
-        assert!(v1.iter_usize().all(|e| e < length));
+        assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // A large length and larger amount should use cache
@@ -1064,7 +1062,7 @@ mod test {
         let (length, amount): (usize, usize) = (1<<20, 600);
         let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
         let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
-        assert!(v1.iter_usize().all(|e| e < length));
+        assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
     }
     

From fb64cf26f78f7f01beba3c242f3ecf21cffa4c46 Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Wed, 4 Jul 2018 18:03:09 +0100
Subject: [PATCH 12/14] sample_indices: add new seq::index module for index
 sampling

---
 benches/seq.rs             |  16 +-
 src/seq/index.rs           | 386 ++++++++++++++++++++++++++++++++++++
 src/{seq.rs => seq/mod.rs} | 392 +------------------------------------
 3 files changed, 404 insertions(+), 390 deletions(-)
 create mode 100644 src/seq/index.rs
 rename src/{seq.rs => seq/mod.rs} (64%)

diff --git a/benches/seq.rs b/benches/seq.rs
index a38ad1148f3..77de182bf0f 100644
--- a/benches/seq.rs
+++ b/benches/seq.rs
@@ -87,16 +87,16 @@ macro_rules! sample_indices {
         fn $name(b: &mut Bencher) {
             let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
             b.iter(|| {
-                $fn(&mut rng, $length, $amount, true)
+                index::$fn(&mut rng, $length, $amount, false)
             })
         }
     }
 }
 
-sample_indices!(misc_sample_indices_1_of_1k, sample_indices, 1, 1000);
-sample_indices!(misc_sample_indices_10_of_1k, sample_indices, 10, 1000);
-sample_indices!(misc_sample_indices_100_of_1k, sample_indices, 100, 1000);
-sample_indices!(misc_sample_indices_100_of_1M, sample_indices, 100, 1000_000);
-sample_indices!(misc_sample_indices_100_of_1G, sample_indices, 100, 1000_000_000);
-sample_indices!(misc_sample_indices_400_of_1G, sample_indices, 400, 1000_000_000);
-sample_indices!(misc_sample_indices_600_of_1G, sample_indices, 600, 1000_000_000);
+sample_indices!(misc_sample_indices_1_of_1k, sample, 1, 1000);
+sample_indices!(misc_sample_indices_10_of_1k, sample, 10, 1000);
+sample_indices!(misc_sample_indices_100_of_1k, sample, 100, 1000);
+sample_indices!(misc_sample_indices_100_of_1M, sample, 100, 1000_000);
+sample_indices!(misc_sample_indices_100_of_1G, sample, 100, 1000_000_000);
+sample_indices!(misc_sample_indices_400_of_1G, sample, 400, 1000_000_000);
+sample_indices!(misc_sample_indices_600_of_1G, sample, 600, 1000_000_000);
diff --git a/src/seq/index.rs b/src/seq/index.rs
new file mode 100644
index 00000000000..aee53c498a2
--- /dev/null
+++ b/src/seq/index.rs
@@ -0,0 +1,386 @@
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// https://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Index sampling
+
+#[cfg(feature="alloc")] use core::slice;
+
+#[cfg(feature="std")] use std::vec;
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::{self, Vec};
+// BTreeSet is not as fast as HashSet in tests, but better than nothing.
+#[cfg(feature="std")] use std::collections::{HashSet};
+#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
+
+#[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
+use Rng;
+
+/// A vector of indices.
+/// 
+/// Multiple internal representations are possible.
+#[derive(Clone, Debug)]
+pub enum IndexVec {
+    #[doc(hidden)] U32(Vec<u32>),
+    #[doc(hidden)] USize(Vec<usize>),
+}
+
+impl IndexVec {
+    /// Returns the number of indices
+    pub fn len(&self) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v.len(),
+            &IndexVec::USize(ref v) => v.len(),
+        }
+    }
+    
+    /// Return the value at the given `index`.
+    /// 
+    /// (Note: we cannot implement `std::ops::Index` because of lifetime
+    /// restrictions.)
+    pub fn index(&self, index: usize) -> usize {
+        match self {
+            &IndexVec::U32(ref v) => v[index] as usize,
+            &IndexVec::USize(ref v) => v[index],
+        }
+    }
+
+    /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
+    pub fn into_vec(self) -> Vec<usize> {
+        match self {
+            IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
+            IndexVec::USize(v) => v,
+        }
+    }
+
+    /// Iterate over the indices as a sequence of `usize` values
+    pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
+        match self {
+            &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
+            &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
+        }
+    }
+    
+    /// Convert into an iterator over the indices as a sequence of `usize` values
+    pub fn into_iter(self) -> IndexVecIntoIter {
+        match self {
+            IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
+            IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
+        }
+    }
+}
+
+impl PartialEq for IndexVec {
+    fn eq(&self, other: &IndexVec) -> bool {
+        use self::IndexVec::*;
+        match (self, other) {
+            (&U32(ref v1), &U32(ref v2)) => v1 == v2,
+            (&USize(ref v1), &USize(ref v2)) => v1 == v2,
+            (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
+            (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
+                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
+        }
+    }
+}
+
+impl From<Vec<u32>> for IndexVec {
+    fn from(v: Vec<u32>) -> Self {
+        IndexVec::U32(v)
+    }
+}
+
+impl From<Vec<usize>> for IndexVec {
+    fn from(v: Vec<usize>) -> Self {
+        IndexVec::USize(v)
+    }
+}
+
+/// Return type of `IndexVec::iter`.
+#[derive(Debug)]
+pub enum IndexVecIter<'a> {
+    #[doc(hidden)] U32(slice::Iter<'a, u32>),
+    #[doc(hidden)] USize(slice::Iter<'a, usize>),
+}
+
+impl<'a> Iterator for IndexVecIter<'a> {
+    type Item = usize;
+    fn next(&mut self) -> Option<usize> {
+        use self::IndexVecIter::*;
+        match self {
+            &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
+            &mut USize(ref mut iter) => iter.next().cloned(),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        match self {
+            &IndexVecIter::U32(ref v) => v.size_hint(),
+            &IndexVecIter::USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
+
+/// Return type of `IndexVec::into_iter`.
+#[derive(Clone, Debug)]
+pub enum IndexVecIntoIter {
+    #[doc(hidden)] U32(vec::IntoIter<u32>),
+    #[doc(hidden)] USize(vec::IntoIter<usize>),
+}
+
+impl Iterator for IndexVecIntoIter {
+    type Item = usize;
+    
+    fn next(&mut self) -> Option<Self::Item> {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &mut U32(ref mut v) => v.next().map(|i| i as usize),
+            &mut USize(ref mut v) => v.next(),
+        }
+    }
+    
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        use self::IndexVecIntoIter::*;
+        match self {
+            &U32(ref v) => v.size_hint(),
+            &USize(ref v) => v.size_hint(),
+        }
+    }
+}
+
+impl ExactSizeIterator for IndexVecIntoIter {}
+
+
+/// Randomly sample exactly `amount` distinct indices from `0..length`.
+///
+/// If `shuffled == true` then the sampled values will be fully shuffled;
+/// otherwise the values may only partially shuffled, depending on the
+/// algorithm used (i.e. biases may exist in the ordering of sampled elements).
+/// Depending on the algorithm used internally, full shuffling may add
+/// significant overhead for `amount` > 10 or so, but not more than double
+/// the time and often much less.
+///
+/// This method is used internally by the slice sampling methods, but it can
+/// sometimes be useful to have the indices themselves so this is provided as
+/// an alternative.
+///
+/// The implementation used is not specified; we automatically select the
+/// fastest available implementation for the `length` and `amount` parameters
+/// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
+/// complexity is `O(amount)`, except that when `amount` is small, performance
+/// is closer to `O(amount^2)`, and when `length` is close to `amount` then
+/// `O(length)`.
+///
+/// Note that performance is significantly better over `u32` indices than over
+/// `u64` indices. Because of this we hide the underlying type behind an
+/// abstraction, `IndexVec`.
+/// 
+/// If an allocation-free `no_std` function is required, it is suggested
+/// to adapt the internal `sample_floyd` implementation.
+///
+/// Panics if `amount > length`.
+pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
+    shuffled: bool) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    if amount > length {
+        panic!("`amount` of samples must be less than or equal to `length`");
+    }
+    if length > (::core::u32::MAX as usize) {
+        // We never want to use inplace here, but could use floyd's alg
+        // Lazy version: always use the rejection sampling alg.
+        return sample_rejection(rng, length, amount);
+    }
+    let amount = amount as u32;
+    let length = length as u32;
+    
+    // Choice of algorithm here depends on both length and amount. See:
+    // https://github.com/rust-lang-nursery/rand/pull/479
+    // We do some calculations with f32. Accuracy is not very important.
+
+    if amount < 442 {
+        const C: [[f32; 2]; 2] = [[1.2, 6.0/45.0], [10.0, 70.0/9.0]];
+        let j = if length < 500_000 { 0 } else { 1 };
+        let amount_fp = amount as f32;
+        let m4 = C[0][j] * amount_fp;
+        // Short-cut: when amount < 12, floyd's is always faster
+        if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
+            sample_inplace(rng, length, amount)
+        } else {
+            sample_floyd(rng, length, amount, shuffled)
+        }
+    } else {
+        const C: [f32; 2] = [590.0, 600.0/9.0];
+        let j = if length < 500_000 { 0 } else { 1 };
+        if (length as f32) < C[j] * (amount as f32) {
+            sample_inplace(rng, length, amount)
+        } else {
+            // note: could have a specific u32 impl, but I'm lazy and
+            // generics don't have usable conversions
+            sample_rejection(rng, length as usize, amount as usize)
+        }
+    }
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
+/// combination algorithm.
+/// 
+/// If `shuffled == false`, the values are only partially shuffled (i.e. biases
+/// exist in the ordering of sampled elements). If `shuffled == true`, the
+/// values are fully shuffled.
+///
+/// This implementation uses `O(amount)` memory and `O(amount^2)` time.
+fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount <= length);
+    let mut indices = Vec::with_capacity(amount as usize);
+    for j in length - amount .. length {
+        let t = rng.gen_range(0, j + 1);
+        if indices.contains(&t) {
+            indices.push(j)
+        } else {
+            indices.push(t)
+        };
+    }
+    if shuffled {
+        // Note that there is a variant of Floyd's algorithm with native full
+        // shuffling, but it is slow because it requires arbitrary insertions.
+        use super::SliceRandom;
+        indices.shuffle(rng);
+    }
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using an inplace
+/// partial Fisher-Yates method.
+///
+/// This allocates the entire `length` of indices and randomizes only the
+/// first `amount`.
+/// It then truncates to `amount` and returns.
+/// 
+/// This method is not appropriate for large `length` and potentially uses a lot
+/// of memory; because of this we only implement for `u32` index (which improves
+/// performance in all cases).
+///
+/// This is likely the fastest for small lengths since it avoids the need for
+/// allocations. Set-up is `O(length)` time and memory and shuffling is
+/// `O(amount)` time.
+fn sample_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount <= length);
+    let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
+    indices.extend(0..length);
+    for i in 0..amount {
+        let j: u32 = rng.gen_range(i, length);
+        indices.swap(i as usize, j as usize);
+    }
+    indices.truncate(amount as usize);
+    debug_assert_eq!(indices.len(), amount as usize);
+    IndexVec::from(indices)
+}
+
+/// Randomly sample exactly `amount` indices from `0..length`, using rejection
+/// sampling.
+/// 
+/// Since `amount` is much smaller than `length`, there is a low chance of a
+/// random sample in `0..length` being a duplicate. We test for duplicates and
+/// resample where necessary. The algorithm is `O(amount)` time and memory.
+fn sample_rejection<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
+    where R: Rng + ?Sized,
+{
+    debug_assert!(amount < length);
+    #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
+    #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
+    let distr = Uniform::new(0, length);
+    let mut indices = Vec::with_capacity(amount);
+    for _ in 0..amount {
+        let mut pos = distr.sample(rng);
+        while !cache.insert(pos) {
+            pos = distr.sample(rng);
+        }
+        indices.push(pos);
+    }
+    
+    debug_assert_eq!(indices.len(), amount);
+    IndexVec::from(indices)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use {Rng, SeedableRng};
+    use prng::XorShiftRng;
+    
+    #[test]
+    fn test_sample_boundaries() {
+        let mut r = ::test::rng(404);
+        
+        assert_eq!(sample_inplace(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_inplace(&mut r, 1, 1).into_vec(), vec![0]);
+
+        assert_eq!(sample_rejection(&mut r, 1, 0).len(), 0);
+
+        assert_eq!(sample_floyd(&mut r, 0, 0, false).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 0, false).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 1, false).into_vec(), vec![0]);
+        
+        // These algorithms should be fast with big numbers. Test average.
+        let sum: usize = sample_rejection(&mut r, 1 << 25, 10)
+                .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+        
+        let sum: usize = sample_floyd(&mut r, 1 << 25, 10, false)
+                .into_iter().sum();
+        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
+    }
+    
+    #[test]
+    fn test_sample_alg() {
+        let xor_rng = XorShiftRng::from_seed;
+
+        let mut r = ::test::rng(403);
+        let mut seed = [0u8; 16];
+        
+        // We can't test which algorithm is used directly, but Floyd's alg
+        // should produce different results from the others. (Also, `inplace`
+        // and `rejection` currently use different sizes thus produce different results.)
+        
+        // A small length and relatively large amount should use inplace
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (100, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_inplace(&mut xor_rng(seed), length as u32, amount as u32);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+        
+        // Test Floyd's alg does produce different results
+        let v3 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
+        assert!(v1 != v3);
+        
+        // A large length and small amount should use Floyd
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 50);
+        let v1 = sample(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+        
+        // A large length and larger amount should use cache
+        r.fill(&mut seed);
+        let (length, amount): (usize, usize) = (1<<20, 600);
+        let v1 = sample(&mut xor_rng(seed), length, amount, true);
+        let v2 = sample_rejection(&mut xor_rng(seed), length, amount);
+        assert!(v1.iter().all(|e| e < length));
+        assert_eq!(v1, v2);
+    }
+}
diff --git a/src/seq.rs b/src/seq/mod.rs
similarity index 64%
rename from src/seq.rs
rename to src/seq/mod.rs
index 3c48a2135d9..830ddcaf6ea 100644
--- a/src/seq.rs
+++ b/src/seq/mod.rs
@@ -12,18 +12,14 @@
 //! 
 //! TODO: module doc
 
+
+#[cfg(feature="alloc")] pub mod index;
+
 #[cfg(feature="alloc")] use core::ops::Index;
-#[cfg(feature="alloc")] use core::slice;
 
-#[cfg(feature="std")] use std::vec;
-#[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec;
 #[cfg(all(feature="alloc", not(feature="std")))] use alloc::vec::Vec;
-// BTreeMap is not as fast in tests, but better than nothing.
-#[cfg(feature="std")] use std::collections::{HashSet};
-#[cfg(all(feature="alloc", not(feature="std")))] use alloc::collections::BTreeSet;
 
 use Rng;
-#[cfg(feature="alloc")] use distributions::{Distribution, Uniform};
 #[cfg(feature="alloc")] use distributions::WeightedError;
 #[cfg(feature="alloc")] use distributions::uniform::{SampleUniform, SampleBorrow};
 
@@ -64,7 +60,7 @@ pub trait SliceRandom {
     /// Produces an iterator that chooses `amount` elements from the slice at
     /// random without repeating any.
     ///
-    /// In case this API is not sufficiently flexible, use `sample_indices` then
+    /// In case this API is not sufficiently flexible, use `index::sample` then
     /// apply the indices to the slice.
     /// 
     /// If `shuffled == true` then the sampled values will be fully shuffled;
@@ -74,7 +70,7 @@ pub trait SliceRandom {
     /// may add significant overhead for `amount` > 10 or so, but not more
     /// than double the time and often much less.
     ///
-    /// Complexity is expected to be the same as `sample_indices`.
+    /// Complexity is expected to be the same as `index::sample`.
     /// 
     /// # Example
     /// ```
@@ -327,7 +323,7 @@ impl<T> SliceRandom for [T] {
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: sample_indices(rng, self.len(), amount, shuffled).into_iter(),
+            indices: index::sample(rng, self.len(), amount, shuffled).into_iter(),
         }
     }
 
@@ -399,7 +395,7 @@ impl<I> IteratorRandom for I where I: Iterator + Sized {}
 pub struct SliceChooseIter<'a, S: ?Sized + 'a, T: 'a> {
     slice: &'a S,
     _phantom: ::core::marker::PhantomData<T>,
-    indices: IndexVecIntoIter,
+    indices: index::IndexVecIntoIter,
 }
 
 #[cfg(feature = "alloc")]
@@ -464,7 +460,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     where R: Rng + ?Sized,
           T: Clone
 {
-    let indices = sample_indices(rng, slice.len(), amount, true).into_iter();
+    let indices = index::sample(rng, slice.len(), amount, true).into_iter();
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.map(|i| slice[i].clone()));
@@ -487,320 +483,13 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
 pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T>
     where R: Rng + ?Sized
 {
-    let indices = sample_indices(rng, slice.len(), amount, true).into_iter();
+    let indices = index::sample(rng, slice.len(), amount, true).into_iter();
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.map(|i| &slice[i]));
     out
 }
 
-/// A vector of indices.
-/// 
-/// Multiple internal representations are possible.
-#[cfg(feature = "alloc")]
-#[derive(Clone, Debug)]
-pub enum IndexVec {
-    #[doc(hidden)] U32(Vec<u32>),
-    #[doc(hidden)] USize(Vec<usize>),
-}
-
-#[cfg(feature = "alloc")]
-impl IndexVec {
-    /// Returns the number of indices
-    pub fn len(&self) -> usize {
-        match self {
-            &IndexVec::U32(ref v) => v.len(),
-            &IndexVec::USize(ref v) => v.len(),
-        }
-    }
-    
-    /// Return the value at the given `index`.
-    /// 
-    /// (Note: we cannot implement `std::ops::Index` because of lifetime
-    /// restrictions.)
-    pub fn index(&self, index: usize) -> usize {
-        match self {
-            &IndexVec::U32(ref v) => v[index] as usize,
-            &IndexVec::USize(ref v) => v[index],
-        }
-    }
-
-    /// Return result as a `Vec<usize>`. Conversion may or may not be trivial.
-    pub fn into_vec(self) -> Vec<usize> {
-        match self {
-            IndexVec::U32(v) => v.into_iter().map(|i| i as usize).collect(),
-            IndexVec::USize(v) => v,
-        }
-    }
-
-    /// Iterate over the indices as a sequence of `usize` values
-    pub fn iter<'a>(&'a self) -> IndexVecIter<'a> {
-        match self {
-            &IndexVec::U32(ref v) => IndexVecIter::U32(v.iter()),
-            &IndexVec::USize(ref v) => IndexVecIter::USize(v.iter()),
-        }
-    }
-    
-    /// Convert into an iterator over the indices as a sequence of `usize` values
-    pub fn into_iter(self) -> IndexVecIntoIter {
-        match self {
-            IndexVec::U32(v) => IndexVecIntoIter::U32(v.into_iter()),
-            IndexVec::USize(v) => IndexVecIntoIter::USize(v.into_iter()),
-        }
-    }
-}
-
-#[cfg(feature = "alloc")]
-impl PartialEq for IndexVec {
-    fn eq(&self, other: &IndexVec) -> bool {
-        use self::IndexVec::*;
-        match (self, other) {
-            (&U32(ref v1), &U32(ref v2)) => v1 == v2,
-            (&USize(ref v1), &USize(ref v2)) => v1 == v2,
-            (&U32(ref v1), &USize(ref v2)) => (v1.len() == v2.len())
-                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x as usize == *y)),
-            (&USize(ref v1), &U32(ref v2)) => (v1.len() == v2.len())
-                && (v1.iter().zip(v2.iter()).all(|(x, y)| *x == *y as usize)),
-        }
-    }
-}
-
-#[cfg(feature = "alloc")]
-impl From<Vec<u32>> for IndexVec {
-    fn from(v: Vec<u32>) -> Self {
-        IndexVec::U32(v)
-    }
-}
-
-#[cfg(feature = "alloc")]
-impl From<Vec<usize>> for IndexVec {
-    fn from(v: Vec<usize>) -> Self {
-        IndexVec::USize(v)
-    }
-}
-
-/// Return type of `IndexVec::iter`.
-#[cfg(feature = "alloc")]
-#[derive(Debug)]
-pub enum IndexVecIter<'a> {
-    #[doc(hidden)] U32(slice::Iter<'a, u32>),
-    #[doc(hidden)] USize(slice::Iter<'a, usize>),
-}
-
-#[cfg(feature = "alloc")]
-impl<'a> Iterator for IndexVecIter<'a> {
-    type Item = usize;
-    fn next(&mut self) -> Option<usize> {
-        use self::IndexVecIter::*;
-        match self {
-            &mut U32(ref mut iter) => iter.next().map(|i| *i as usize),
-            &mut USize(ref mut iter) => iter.next().cloned(),
-        }
-    }
-    
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        match self {
-            &IndexVecIter::U32(ref v) => v.size_hint(),
-            &IndexVecIter::USize(ref v) => v.size_hint(),
-        }
-    }
-}
-
-#[cfg(feature = "alloc")]
-impl<'a> ExactSizeIterator for IndexVecIter<'a> {}
-
-/// Return type of `IndexVec::into_iter`.
-#[cfg(feature = "alloc")]
-#[derive(Clone, Debug)]
-pub enum IndexVecIntoIter {
-    #[doc(hidden)] U32(vec::IntoIter<u32>),
-    #[doc(hidden)] USize(vec::IntoIter<usize>),
-}
-
-#[cfg(feature = "alloc")]
-impl Iterator for IndexVecIntoIter {
-    type Item = usize;
-    
-    fn next(&mut self) -> Option<Self::Item> {
-        use self::IndexVecIntoIter::*;
-        match self {
-            &mut U32(ref mut v) => v.next().map(|i| i as usize),
-            &mut USize(ref mut v) => v.next(),
-        }
-    }
-    
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        use self::IndexVecIntoIter::*;
-        match self {
-            &U32(ref v) => v.size_hint(),
-            &USize(ref v) => v.size_hint(),
-        }
-    }
-}
-
-#[cfg(feature = "alloc")]
-impl ExactSizeIterator for IndexVecIntoIter {}
-
-
-/// Randomly sample exactly `amount` distinct indices from `0..length`.
-///
-/// If `shuffled == true` then the sampled values will be fully shuffled;
-/// otherwise the values may only partially shuffled, depending on the
-/// algorithm used (i.e. biases may exist in the ordering of sampled elements).
-/// Depending on the algorithm used internally, full shuffling may add
-/// significant overhead for `amount` > 10 or so, but not more than double
-/// the time and often much less.
-///
-/// This method is used internally by the slice sampling methods, but it can
-/// sometimes be useful to have the indices themselves so this is provided as
-/// an alternative.
-///
-/// The implementation used is not specified; we automatically select the
-/// fastest available implementation for the `length` and `amount` parameters
-/// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
-/// complexity is `O(amount)`, except that when `amount` is small, performance
-/// is closer to `O(amount^2)`, and when `length` is close to `amount` then
-/// `O(length)`.
-///
-/// Note that performance is significantly better over `u32` indices than over
-/// `u64` indices. Because of this we hide the underlying type behind an
-/// abstraction, `IndexVec`.
-/// 
-/// If an allocation-free `no_std` function is required, it is suggested
-/// to adapt the internal `sample_indices_floyd` implementation.
-///
-/// Panics if `amount > length`.
-#[cfg(feature = "alloc")]
-pub fn sample_indices<R>(rng: &mut R, length: usize, amount: usize,
-    shuffled: bool) -> IndexVec
-    where R: Rng + ?Sized,
-{
-    if amount > length {
-        panic!("`amount` of samples must be less than or equal to `length`");
-    }
-    if length > (::core::u32::MAX as usize) {
-        // We never want to use inplace here, but could use floyd's alg
-        // Lazy version: always use the cache alg.
-        return sample_indices_cache(rng, length, amount);
-    }
-    let amount = amount as u32;
-    let length = length as u32;
-    
-    // Choice of algorithm here depends on both length and amount. See:
-    // https://github.com/rust-lang-nursery/rand/pull/479
-    // We do some calculations with f32. Accuracy is not very important.
-
-    if amount < 442 {
-        const C: [[f32; 2]; 2] = [[1.2, 6.0/45.0], [10.0, 70.0/9.0]];
-        let j = if length < 500_000 { 0 } else { 1 };
-        let amount_fp = amount as f32;
-        let m4 = C[0][j] * amount_fp;
-        // Short-cut: when amount < 12, floyd's is always faster
-        if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
-            sample_indices_inplace(rng, length, amount)
-        } else {
-            sample_indices_floyd(rng, length, amount, shuffled)
-        }
-    } else {
-        const C: [f32; 2] = [590.0, 600.0/9.0];
-        let j = if length < 500_000 { 0 } else { 1 };
-        if (length as f32) < C[j] * (amount as f32) {
-            sample_indices_inplace(rng, length, amount)
-        } else {
-            // note: could have a specific u32 impl, but I'm lazy and
-            // generics don't have usable conversions
-            sample_indices_cache(rng, length as usize, amount as usize)
-        }
-    }
-}
-
-/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
-/// combination algorithm.
-/// 
-/// If `shuffled == false`, the values are only partially shuffled (i.e. biases
-/// exist in the ordering of sampled elements). If `shuffled == true`, the
-/// values are fully shuffled.
-///
-/// This implementation uses `O(amount)` memory and `O(amount^2)` time.
-#[cfg(feature = "alloc")]
-fn sample_indices_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> IndexVec
-    where R: Rng + ?Sized,
-{
-    debug_assert!(amount <= length);
-    let mut indices = Vec::with_capacity(amount as usize);
-    for j in length - amount .. length {
-        let t = rng.gen_range(0, j + 1);
-        if indices.contains(&t) {
-            indices.push(j)
-        } else {
-            indices.push(t)
-        };
-    }
-    if shuffled {
-        // Note that there is a variant of Floyd's algorithm with native full
-        // shuffling, but it is slow because it requires arbitrary insertions.
-        indices.shuffle(rng);
-    }
-    IndexVec::from(indices)
-}
-
-/// Randomly sample exactly `amount` indices from `0..length`, using an inplace
-/// partial Fisher-Yates method.
-/// Sample an amount of indices using an inplace partial fisher yates method.
-///
-/// This allocates the entire `length` of indices and randomizes only the first `amount`.
-/// It then truncates to `amount` and returns.
-/// 
-/// This method is not appropriate for large `length` and potentially uses a lot
-/// of memory; because of this we only implement for `u32` index (which improves
-/// performance in all cases).
-///
-/// This is likely the fastest for small lengths since it avoids the need for
-/// allocations. Set-up is `O(length)` time and memory and shuffling is
-/// `O(amount)` time.
-#[cfg(feature = "alloc")]
-fn sample_indices_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
-    where R: Rng + ?Sized,
-{
-    debug_assert!(amount <= length);
-    let mut indices: Vec<u32> = Vec::with_capacity(length as usize);
-    indices.extend(0..length);
-    for i in 0..amount {
-        let j: u32 = rng.gen_range(i, length);
-        indices.swap(i as usize, j as usize);
-    }
-    indices.truncate(amount as usize);
-    debug_assert_eq!(indices.len(), amount as usize);
-    IndexVec::from(indices)
-}
-
-/// Randomly sample exactly `amount` indices from `0..length`, using rejection
-/// sampling.
-/// 
-/// Since `amount <<< length` there is a low chance of a random sample in
-/// `0..length` being a duplicate. We test for duplicates and resample where
-/// necessary. The algorithm is `O(amount)` time and memory.
-#[cfg(feature = "alloc")]
-fn sample_indices_cache<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
-    where R: Rng + ?Sized,
-{
-    debug_assert!(amount < length);
-    #[cfg(feature="std")] let mut cache = HashSet::with_capacity(amount);
-    #[cfg(not(feature="std"))] let mut cache = BTreeSet::new();
-    let distr = Uniform::new(0, length);
-    let mut indices = Vec::with_capacity(amount);
-    for _ in 0..amount {
-        let mut pos = distr.sample(rng);
-        while !cache.insert(pos) {
-            pos = distr.sample(rng);
-        }
-        indices.push(pos);
-    }
-    
-    debug_assert_eq!(indices.len(), amount);
-    IndexVec::from(indices)
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
@@ -855,7 +544,6 @@ mod test {
 
     #[test]
     fn test_shuffle() {
-
         let mut r = ::test::rng(108);
         let empty: &mut [isize] = &mut [];
         empty.shuffle(&mut r);
@@ -959,25 +647,6 @@ mod test {
         let v = sample_slice(&mut r, &[42, 133], 2);
         assert!(&v[..] == [42, 133] || v[..] == [133, 42]);
 
-        assert_eq!(sample_indices_inplace(&mut r, 0, 0).len(), 0);
-        assert_eq!(sample_indices_inplace(&mut r, 1, 0).len(), 0);
-        assert_eq!(sample_indices_inplace(&mut r, 1, 1).into_vec(), vec![0]);
-
-        assert_eq!(sample_indices_cache(&mut r, 1, 0).len(), 0);
-
-        assert_eq!(sample_indices_floyd(&mut r, 0, 0, false).len(), 0);
-        assert_eq!(sample_indices_floyd(&mut r, 1, 0, false).len(), 0);
-        assert_eq!(sample_indices_floyd(&mut r, 1, 1, false).into_vec(), vec![0]);
-        
-        // These algorithms should be fast with big numbers. Test average.
-        let sum: usize = sample_indices_cache(&mut r, 1 << 25, 10)
-                .into_iter().sum();
-        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
-        
-        let sum: usize = sample_indices_floyd(&mut r, 1 << 25, 10, false)
-                .into_iter().sum();
-        assert!(1 << 25 < sum && sum < (1 << 25) * 25);
-
         // Make sure lucky 777's aren't lucky
         let slice = &[42, 777];
         let mut num_42 = 0;
@@ -1010,7 +679,7 @@ mod test {
             r.fill(&mut seed);
 
             // assert the basics work
-            let regular = sample_indices(
+            let regular = index::sample(
                 &mut xor_rng(seed), length, amount, true);
             assert_eq!(regular.len(), amount);
             assert!(regular.iter().all(|e| e < length));
@@ -1025,47 +694,6 @@ mod test {
         }
     }
     
-    #[test]
-    #[cfg(feature = "alloc")]
-    fn test_sample_alg() {
-        let xor_rng = XorShiftRng::from_seed;
-
-        let mut r = ::test::rng(403);
-        let mut seed = [0u8; 16];
-        
-        // We can't test which algorithm is used directly, but Floyd's alg
-        // should produce different results from the others. (Also, `inplace`
-        // and `cached` currently use different sizes thus produce different results.)
-        
-        // A small length and relatively large amount should use inplace
-        r.fill(&mut seed);
-        let (length, amount): (usize, usize) = (100, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
-        let v2 = sample_indices_inplace(&mut xor_rng(seed), length as u32, amount as u32);
-        assert!(v1.iter().all(|e| e < length));
-        assert_eq!(v1, v2);
-        
-        // Test Floyd's alg does produce different results
-        let v3 = sample_indices_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
-        assert!(v1 != v3);
-        
-        // A large length and small amount should use Floyd
-        r.fill(&mut seed);
-        let (length, amount): (usize, usize) = (1<<20, 50);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
-        let v2 = sample_indices_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
-        assert!(v1.iter().all(|e| e < length));
-        assert_eq!(v1, v2);
-        
-        // A large length and larger amount should use cache
-        r.fill(&mut seed);
-        let (length, amount): (usize, usize) = (1<<20, 600);
-        let v1 = sample_indices(&mut xor_rng(seed), length, amount, true);
-        let v2 = sample_indices_cache(&mut xor_rng(seed), length, amount);
-        assert!(v1.iter().all(|e| e < length));
-        assert_eq!(v1, v2);
-    }
-    
     #[test]
     #[cfg(feature = "alloc")]
     fn test_weighted() {

From 805022c0964c0e740f112578e69b68bb970cc03c Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Thu, 5 Jul 2018 09:10:12 +0100
Subject: [PATCH 13/14] sample_indices: update model parameters

This accounts for the "cache" method being replaced by
rejection sampling and now using usize again.
---
 src/seq/index.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/seq/index.rs b/src/seq/index.rs
index aee53c498a2..29953704337 100644
--- a/src/seq/index.rs
+++ b/src/seq/index.rs
@@ -205,7 +205,7 @@ pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
     // https://github.com/rust-lang-nursery/rand/pull/479
     // We do some calculations with f32. Accuracy is not very important.
 
-    if amount < 442 {
+    if amount < 217 {
         const C: [[f32; 2]; 2] = [[1.2, 6.0/45.0], [10.0, 70.0/9.0]];
         let j = if length < 500_000 { 0 } else { 1 };
         let amount_fp = amount as f32;
@@ -217,7 +217,7 @@ pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
             sample_floyd(rng, length, amount, shuffled)
         }
     } else {
-        const C: [f32; 2] = [590.0, 600.0/9.0];
+        const C: [f32; 2] = [270.0, 330.0/9.0];
         let j = if length < 500_000 { 0 } else { 1 };
         if (length as f32) < C[j] * (amount as f32) {
             sample_inplace(rng, length, amount)

From 19897e53c1908c5a193e2aacff59170e3e72b8de Mon Sep 17 00:00:00 2001
From: Diggory Hardy <git@dhardy.name>
Date: Thu, 5 Jul 2018 09:31:00 +0100
Subject: [PATCH 14/14] sample_indices: always shuffle. Floyd's alg: optimise.

---
 benches/seq.rs   |  5 +--
 src/seq/index.rs | 90 +++++++++++++++++++++++++++---------------------
 src/seq/mod.rs   | 28 ++++++---------
 3 files changed, 64 insertions(+), 59 deletions(-)

diff --git a/benches/seq.rs b/benches/seq.rs
index 77de182bf0f..f143131763b 100644
--- a/benches/seq.rs
+++ b/benches/seq.rs
@@ -39,7 +39,7 @@ macro_rules! seq_slice_choose_multiple {
                 // Collect full result to prevent unwanted shortcuts getting
                 // first element (in case sample_indices returns an iterator).
                 for (slot, sample) in result.iter_mut().zip(
-                    x.choose_multiple(&mut rng, $amount, false)) {
+                    x.choose_multiple(&mut rng, $amount)) {
                     *slot = *sample;
                 }
                 result[$amount-1]
@@ -87,7 +87,7 @@ macro_rules! sample_indices {
         fn $name(b: &mut Bencher) {
             let mut rng = SmallRng::from_rng(thread_rng()).unwrap();
             b.iter(|| {
-                index::$fn(&mut rng, $length, $amount, false)
+                index::$fn(&mut rng, $length, $amount)
             })
         }
     }
@@ -98,5 +98,6 @@ sample_indices!(misc_sample_indices_10_of_1k, sample, 10, 1000);
 sample_indices!(misc_sample_indices_100_of_1k, sample, 100, 1000);
 sample_indices!(misc_sample_indices_100_of_1M, sample, 100, 1000_000);
 sample_indices!(misc_sample_indices_100_of_1G, sample, 100, 1000_000_000);
+sample_indices!(misc_sample_indices_200_of_1G, sample, 200, 1000_000_000);
 sample_indices!(misc_sample_indices_400_of_1G, sample, 400, 1000_000_000);
 sample_indices!(misc_sample_indices_600_of_1G, sample, 600, 1000_000_000);
diff --git a/src/seq/index.rs b/src/seq/index.rs
index 29953704337..805b7f3c2f3 100644
--- a/src/seq/index.rs
+++ b/src/seq/index.rs
@@ -158,21 +158,15 @@ impl Iterator for IndexVecIntoIter {
 impl ExactSizeIterator for IndexVecIntoIter {}
 
 
-/// Randomly sample exactly `amount` distinct indices from `0..length`.
-///
-/// If `shuffled == true` then the sampled values will be fully shuffled;
-/// otherwise the values may only partially shuffled, depending on the
-/// algorithm used (i.e. biases may exist in the ordering of sampled elements).
-/// Depending on the algorithm used internally, full shuffling may add
-/// significant overhead for `amount` > 10 or so, but not more than double
-/// the time and often much less.
+/// Randomly sample exactly `amount` distinct indices from `0..length`, and
+/// return them in random order (fully shuffled).
 ///
 /// This method is used internally by the slice sampling methods, but it can
 /// sometimes be useful to have the indices themselves so this is provided as
 /// an alternative.
 ///
 /// The implementation used is not specified; we automatically select the
-/// fastest available implementation for the `length` and `amount` parameters
+/// fastest available algorithm for the `length` and `amount` parameters
 /// (based on detailed profiling on an Intel Haswell CPU). Roughly speaking,
 /// complexity is `O(amount)`, except that when `amount` is small, performance
 /// is closer to `O(amount^2)`, and when `length` is close to `amount` then
@@ -186,8 +180,7 @@ impl ExactSizeIterator for IndexVecIntoIter {}
 /// to adapt the internal `sample_floyd` implementation.
 ///
 /// Panics if `amount > length`.
-pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
-    shuffled: bool) -> IndexVec
+pub fn sample<R>(rng: &mut R, length: usize, amount: usize) -> IndexVec
     where R: Rng + ?Sized,
 {
     if amount > length {
@@ -205,8 +198,8 @@ pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
     // https://github.com/rust-lang-nursery/rand/pull/479
     // We do some calculations with f32. Accuracy is not very important.
 
-    if amount < 217 {
-        const C: [[f32; 2]; 2] = [[1.2, 6.0/45.0], [10.0, 70.0/9.0]];
+    if amount < 163 {
+        const C: [[f32; 2]; 2] = [[1.6, 8.0/45.0], [10.0, 70.0/9.0]];
         let j = if length < 500_000 { 0 } else { 1 };
         let amount_fp = amount as f32;
         let m4 = C[0][j] * amount_fp;
@@ -214,7 +207,7 @@ pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
         if amount > 11 && (length as f32) < (C[1][j] + m4) * amount_fp {
             sample_inplace(rng, length, amount)
         } else {
-            sample_floyd(rng, length, amount, shuffled)
+            sample_floyd(rng, length, amount)
         }
     } else {
         const C: [f32; 2] = [270.0, 330.0/9.0];
@@ -232,29 +225,50 @@ pub fn sample<R>(rng: &mut R, length: usize, amount: usize,
 /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
 /// combination algorithm.
 /// 
-/// If `shuffled == false`, the values are only partially shuffled (i.e. biases
-/// exist in the ordering of sampled elements). If `shuffled == true`, the
-/// values are fully shuffled.
+/// The output values are fully shuffled. (Overhead is under 50%.)
 ///
 /// This implementation uses `O(amount)` memory and `O(amount^2)` time.
-fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> IndexVec
+fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
     where R: Rng + ?Sized,
 {
+    // Linear search; same as `slice.iter().position(|&x| x == elt)`.
+    fn find_pos<T: Copy + PartialEq<T>>(slice: &[T], elt: T) -> Option<usize> {
+        for i in 0..slice.len() {
+            if slice[i] == elt {
+                return Some(i);
+            }
+        }
+        None
+    }
+    
+    // For small amounts we use Floyd's fully-shuffled variant. For larger
+    // amounts this is slow due to Vec::insert performance, so we shuffle
+    // afterwards. Benchmarks show little overhead from extra logic.
+    let floyd_shuffle = amount < 50;
+    
     debug_assert!(amount <= length);
     let mut indices = Vec::with_capacity(amount as usize);
     for j in length - amount .. length {
         let t = rng.gen_range(0, j + 1);
-        if indices.contains(&t) {
-            indices.push(j)
+        if floyd_shuffle {
+            if let Some(pos) = find_pos(&indices, t) {
+                indices.insert(pos, j);
+                continue;
+            }
         } else {
-            indices.push(t)
-        };
+            if indices.contains(&t) {
+                indices.push(j);
+                continue;
+            }
+        }
+        indices.push(t);
     }
-    if shuffled {
-        // Note that there is a variant of Floyd's algorithm with native full
-        // shuffling, but it is slow because it requires arbitrary insertions.
-        use super::SliceRandom;
-        indices.shuffle(rng);
+    if !floyd_shuffle {
+        // Reimplement SliceRandom::shuffle with smaller indices
+        for i in (1..amount).rev() {
+            // invariant: elements with index > i have been locked in place.
+            indices.swap(i as usize, rng.gen_range(0, i + 1) as usize);
+        }
     }
     IndexVec::from(indices)
 }
@@ -270,9 +284,7 @@ fn sample_floyd<R>(rng: &mut R, length: u32, amount: u32, shuffled: bool) -> Ind
 /// of memory; because of this we only implement for `u32` index (which improves
 /// performance in all cases).
 ///
-/// This is likely the fastest for small lengths since it avoids the need for
-/// allocations. Set-up is `O(length)` time and memory and shuffling is
-/// `O(amount)` time.
+/// Set-up is `O(length)` time and memory and shuffling is `O(amount)` time.
 fn sample_inplace<R>(rng: &mut R, length: u32, amount: u32) -> IndexVec
     where R: Rng + ?Sized,
 {
@@ -330,16 +342,16 @@ mod test {
 
         assert_eq!(sample_rejection(&mut r, 1, 0).len(), 0);
 
-        assert_eq!(sample_floyd(&mut r, 0, 0, false).len(), 0);
-        assert_eq!(sample_floyd(&mut r, 1, 0, false).len(), 0);
-        assert_eq!(sample_floyd(&mut r, 1, 1, false).into_vec(), vec![0]);
+        assert_eq!(sample_floyd(&mut r, 0, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 0).len(), 0);
+        assert_eq!(sample_floyd(&mut r, 1, 1).into_vec(), vec![0]);
         
         // These algorithms should be fast with big numbers. Test average.
         let sum: usize = sample_rejection(&mut r, 1 << 25, 10)
                 .into_iter().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
         
-        let sum: usize = sample_floyd(&mut r, 1 << 25, 10, false)
+        let sum: usize = sample_floyd(&mut r, 1 << 25, 10)
                 .into_iter().sum();
         assert!(1 << 25 < sum && sum < (1 << 25) * 25);
     }
@@ -358,27 +370,27 @@ mod test {
         // A small length and relatively large amount should use inplace
         r.fill(&mut seed);
         let (length, amount): (usize, usize) = (100, 50);
-        let v1 = sample(&mut xor_rng(seed), length, amount, true);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
         let v2 = sample_inplace(&mut xor_rng(seed), length as u32, amount as u32);
         assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // Test Floyd's alg does produce different results
-        let v3 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
+        let v3 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
         assert!(v1 != v3);
         
         // A large length and small amount should use Floyd
         r.fill(&mut seed);
         let (length, amount): (usize, usize) = (1<<20, 50);
-        let v1 = sample(&mut xor_rng(seed), length, amount, true);
-        let v2 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32, true);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
+        let v2 = sample_floyd(&mut xor_rng(seed), length as u32, amount as u32);
         assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
         
         // A large length and larger amount should use cache
         r.fill(&mut seed);
         let (length, amount): (usize, usize) = (1<<20, 600);
-        let v1 = sample(&mut xor_rng(seed), length, amount, true);
+        let v1 = sample(&mut xor_rng(seed), length, amount);
         let v2 = sample_rejection(&mut xor_rng(seed), length, amount);
         assert!(v1.iter().all(|e| e < length));
         assert_eq!(v1, v2);
diff --git a/src/seq/mod.rs b/src/seq/mod.rs
index 830ddcaf6ea..4e06bac2863 100644
--- a/src/seq/mod.rs
+++ b/src/seq/mod.rs
@@ -58,18 +58,11 @@ pub trait SliceRandom {
         where R: Rng + ?Sized;
 
     /// Produces an iterator that chooses `amount` elements from the slice at
-    /// random without repeating any.
-    ///
+    /// random without repeating any, and returns them in random order.
+    /// 
     /// In case this API is not sufficiently flexible, use `index::sample` then
     /// apply the indices to the slice.
     /// 
-    /// If `shuffled == true` then the sampled values will be fully shuffled;
-    /// otherwise the values may only partially shuffled, depending on the
-    /// algorithm used (i.e. biases may exist in the ordering of sampled
-    /// elements). Depending on the algorithm used internally, full shuffling
-    /// may add significant overhead for `amount` > 10 or so, but not more
-    /// than double the time and often much less.
-    ///
     /// Complexity is expected to be the same as `index::sample`.
     /// 
     /// # Example
@@ -80,16 +73,16 @@ pub trait SliceRandom {
     /// let sample = "Hello, audience!".as_bytes();
     /// 
     /// // collect the results into a vector:
-    /// let v: Vec<u8> = sample.choose_multiple(&mut rng, 3, true).cloned().collect();
+    /// let v: Vec<u8> = sample.choose_multiple(&mut rng, 3).cloned().collect();
     /// 
     /// // store in a buffer:
     /// let mut buf = [0u8; 5];
-    /// for (b, slot) in sample.choose_multiple(&mut rng, buf.len(), true).zip(buf.iter_mut()) {
+    /// for (b, slot) in sample.choose_multiple(&mut rng, buf.len()).zip(buf.iter_mut()) {
     ///     *slot = *b;
     /// }
     /// ```
     #[cfg(feature = "alloc")]
-    fn choose_multiple<R>(&self, rng: &mut R, amount: usize, shuffled: bool) -> SliceChooseIter<Self, Self::Item>
+    fn choose_multiple<R>(&self, rng: &mut R, amount: usize) -> SliceChooseIter<Self, Self::Item>
         where R: Rng + ?Sized;
 
     /// Similar to [`choose`], where the likelihood of each outcome may be
@@ -315,7 +308,7 @@ impl<T> SliceRandom for [T] {
     }
 
     #[cfg(feature = "alloc")]
-    fn choose_multiple<R>(&self, rng: &mut R, amount: usize, shuffled: bool)
+    fn choose_multiple<R>(&self, rng: &mut R, amount: usize)
         -> SliceChooseIter<Self, Self::Item>
         where R: Rng + ?Sized
     {
@@ -323,7 +316,7 @@ impl<T> SliceRandom for [T] {
         SliceChooseIter {
             slice: self,
             _phantom: Default::default(),
-            indices: index::sample(rng, self.len(), amount, shuffled).into_iter(),
+            indices: index::sample(rng, self.len(), amount).into_iter(),
         }
     }
 
@@ -460,7 +453,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
     where R: Rng + ?Sized,
           T: Clone
 {
-    let indices = index::sample(rng, slice.len(), amount, true).into_iter();
+    let indices = index::sample(rng, slice.len(), amount).into_iter();
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.map(|i| slice[i].clone()));
@@ -483,7 +476,7 @@ pub fn sample_slice<R, T>(rng: &mut R, slice: &[T], amount: usize) -> Vec<T>
 pub fn sample_slice_ref<'a, R, T>(rng: &mut R, slice: &'a [T], amount: usize) -> Vec<&'a T>
     where R: Rng + ?Sized
 {
-    let indices = index::sample(rng, slice.len(), amount, true).into_iter();
+    let indices = index::sample(rng, slice.len(), amount).into_iter();
 
     let mut out = Vec::with_capacity(amount);
     out.extend(indices.map(|i| &slice[i]));
@@ -679,8 +672,7 @@ mod test {
             r.fill(&mut seed);
 
             // assert the basics work
-            let regular = index::sample(
-                &mut xor_rng(seed), length, amount, true);
+            let regular = index::sample(&mut xor_rng(seed), length, amount);
             assert_eq!(regular.len(), amount);
             assert!(regular.iter().all(|e| e < length));