From 483b29a0d1cf9f4dadf344457c6398e1e541a51b Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 19:26:13 +0000
Subject: [PATCH 01/41] Add function declaration for pairwise_sum

---
 src/numeric_util.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index b36d11605..29b380341 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -6,9 +6,17 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 use std::cmp;
-
+use std::ops::Add;
+use num_traits::{self, Zero};
 use crate::LinalgScalar;
 
+pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
+where
+    A: Clone + Add<Output=A> + Zero,
+{
+    unimplemented!()
+}
+
 /// Fold over the manually unrolled `xs` with `f`
 pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
     where A: Clone,

From 75860b60e10d515c104f345b32097d87dbcbdb31 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 19:26:59 +0000
Subject: [PATCH 02/41] Base case: array with 512 elements

---
 src/numeric_util.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 29b380341..db83f7a7a 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -14,7 +14,11 @@ pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
 where
     A: Clone + Add<Output=A> + Zero,
 {
-    unimplemented!()
+    if v.len() <= 512 {
+        unimplemented!()
+    } else {
+        unimplemented!()
+    }
 }
 
 /// Fold over the manually unrolled `xs` with `f`

From b8304c0a80aebc3e21ff5841074de354ec993a82 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 19:27:36 +0000
Subject: [PATCH 03/41] Base case: use unrolled_fold

---
 src/numeric_util.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index db83f7a7a..00108c4e1 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -15,7 +15,7 @@ where
     A: Clone + Add<Output=A> + Zero,
 {
     if v.len() <= 512 {
-        unimplemented!()
+        return unrolled_fold(v, A::zero, A::add);
     } else {
         unimplemented!()
     }

From ec3722a93d07dade4eccc17b80ccedbfd741c15b Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 19:30:25 +0000
Subject: [PATCH 04/41] Implemented algorithm for not-base case branch

---
 src/numeric_util.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 00108c4e1..6ef6d900b 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -14,10 +14,13 @@ pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
 where
     A: Clone + Add<Output=A> + Zero,
 {
-    if v.len() <= 512 {
+    let n = v.len();
+    if n <= 512 {
         return unrolled_fold(v, A::zero, A::add);
     } else {
-        unimplemented!()
+        let mid_index = n / 2;
+        let (v1, v2) = v.split_at(mid_index);
+        pairwise_sum(v1) + pairwise_sum(v2)
     }
 }
 

From 70d32b742bb2f6727b705a2a56223f4e3c4d3407 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 20:52:14 +0000
Subject: [PATCH 05/41] Implemented pairwise summation algorithm for an
 iterator parameter

---
 src/numeric/impl_numeric.rs |  6 +++---
 src/numeric_util.rs         | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 0016e21c5..179e0f852 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -33,14 +33,14 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Add<Output=A> + num_traits::Zero,
     {
         if let Some(slc) = self.as_slice_memory_order() {
-            return numeric_util::unrolled_fold(slc, A::zero, A::add);
+            return numeric_util::pairwise_sum(&slc)
         }
         let mut sum = A::zero();
         for row in self.inner_rows() {
             if let Some(slc) = row.as_slice() {
-                sum = sum + numeric_util::unrolled_fold(slc, A::zero, A::add);
+                sum = sum + numeric_util::pairwise_sum(&slc);
             } else {
-                sum = sum + row.iter().fold(A::zero(), |acc, elt| acc + elt.clone());
+                sum = sum + numeric_util::iterator_pairwise_sum(row.iter());
             }
         }
         sum
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 6ef6d900b..d6beca9cb 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -24,6 +24,23 @@ where
     }
 }
 
+pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
+    where
+        I: Iterator<Item=&'a A>,
+        A: Clone + Add<Output=A> + Zero,
+{
+    let mut partial_sums = vec![];
+    let mut partial_sum = A::zero();
+    for (i, x) in iter.enumerate() {
+        partial_sum = partial_sum + x.clone();
+        if i % 512 == 511 {
+            partial_sums.push(partial_sum);
+            partial_sum = A::zero();
+        }
+    }
+    pairwise_sum(&partial_sums)
+}
+
 /// Fold over the manually unrolled `xs` with `f`
 pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
     where A: Clone,

From c66dc98ef42038fb75fc2c70279b3c0ece03a5ec Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 21:46:09 +0000
Subject: [PATCH 06/41] Implemented pairwise summation algorithm for an
 iterator parameter with array items

---
 src/numeric/impl_numeric.rs | 12 ++++++------
 src/numeric_util.rs         | 29 +++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 179e0f852..9551a1d30 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -104,21 +104,21 @@ impl<A, S, D> ArrayBase<S, D>
               D: RemoveAxis,
     {
         let n = self.len_of(axis);
-        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
         let stride = self.strides()[axis.index()];
+        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
         if self.ndim() == 2 && stride == 1 {
             // contiguous along the axis we are summing
             let ax = axis.index();
             for (i, elt) in enumerate(&mut res) {
                 *elt = self.index_axis(Axis(1 - ax), i).sum();
             }
+            res
         } else {
-            for i in 0..n {
-                let view = self.index_axis(axis, i);
-                res = res + &view;
-            }
+            numeric_util::array_pairwise_sum(
+                (0..n).map(|i| self.index_axis(axis, i)),
+                || res.clone()
+            )
         }
-        res
     }
 
     /// Return mean along `axis`.
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index d6beca9cb..5c4cebf66 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -8,6 +8,7 @@
 use std::cmp;
 use std::ops::Add;
 use num_traits::{self, Zero};
+use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
 pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
@@ -41,6 +42,34 @@ pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
     pairwise_sum(&partial_sums)
 }
 
+pub(crate) fn array_pairwise_sum<I, A, S, D, F>(iter: I, zero: F) -> Array<A, D>
+    where
+        I: Iterator<Item=ArrayBase<S, D>>,
+        S: Data<Elem=A>,
+        D: Dimension,
+        A: Clone + Add<Output=A>,
+        F: Fn() -> Array<A, D>,
+{
+    let mut partial_sums = vec![];
+    let mut partial_sum = zero();
+    for (i, x) in iter.enumerate() {
+        partial_sum = partial_sum + x;
+        if i % 512 == 511 {
+            partial_sums.push(partial_sum);
+            partial_sum = zero();
+        }
+    }
+    if partial_sums.len() > 512 {
+        array_pairwise_sum(partial_sums.into_iter(), zero)
+    } else {
+        let mut res = zero();
+        for x in partial_sums.iter() {
+            res = res + x;
+        }
+        res
+    }
+}
+
 /// Fold over the manually unrolled `xs` with `f`
 pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
     where A: Clone,

From 175f9a2c2c2416291a895ecff17e36d0ac316e12 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 21:49:27 +0000
Subject: [PATCH 07/41] Refactored: use fold to sum

---
 src/numeric_util.rs | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 5c4cebf66..fd36bd769 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -59,14 +59,16 @@ pub(crate) fn array_pairwise_sum<I, A, S, D, F>(iter: I, zero: F) -> Array<A, D>
             partial_sum = zero();
         }
     }
-    if partial_sums.len() > 512 {
-        array_pairwise_sum(partial_sums.into_iter(), zero)
+
+    if partial_sums.len() <= 512 {
+        partial_sums
+            .iter()
+            .fold(
+                zero(),
+                |acc, elem| acc + elem
+            )
     } else {
-        let mut res = zero();
-        for x in partial_sums.iter() {
-            res = res + x;
-        }
-        res
+        array_pairwise_sum(partial_sums.into_iter(), zero)
     }
 }
 

From 8e403af379ea64420682e4d16fc5bec43d72516c Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 21:57:39 +0000
Subject: [PATCH 08/41] Refactored: use a constant to reuse size value of
 recursion base case across implementations

---
 src/numeric_util.rs | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index fd36bd769..8e3482177 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -11,12 +11,14 @@ use num_traits::{self, Zero};
 use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
+const NAIVE_SUM_THRESHOLD: usize = 512;
+
 pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
 where
     A: Clone + Add<Output=A> + Zero,
 {
     let n = v.len();
-    if n <= 512 {
+    if n <= NAIVE_SUM_THRESHOLD {
         return unrolled_fold(v, A::zero, A::add);
     } else {
         let mid_index = n / 2;
@@ -26,15 +28,15 @@ where
 }
 
 pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
-    where
-        I: Iterator<Item=&'a A>,
-        A: Clone + Add<Output=A> + Zero,
+where
+    I: Iterator<Item=&'a A>,
+    A: Clone + Add<Output=A> + Zero,
 {
     let mut partial_sums = vec![];
     let mut partial_sum = A::zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x.clone();
-        if i % 512 == 511 {
+        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = A::zero();
         }
@@ -43,30 +45,25 @@ pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
 }
 
 pub(crate) fn array_pairwise_sum<I, A, S, D, F>(iter: I, zero: F) -> Array<A, D>
-    where
-        I: Iterator<Item=ArrayBase<S, D>>,
-        S: Data<Elem=A>,
-        D: Dimension,
-        A: Clone + Add<Output=A>,
-        F: Fn() -> Array<A, D>,
+where
+    I: Iterator<Item=ArrayBase<S, D>>,
+    S: Data<Elem=A>,
+    D: Dimension,
+    A: Clone + Add<Output=A>,
+    F: Fn() -> Array<A, D>,
 {
     let mut partial_sums = vec![];
     let mut partial_sum = zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x;
-        if i % 512 == 511 {
+        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = zero();
         }
     }
 
-    if partial_sums.len() <= 512 {
-        partial_sums
-            .iter()
-            .fold(
-                zero(),
-                |acc, elem| acc + elem
-            )
+    if partial_sums.len() <= NAIVE_SUM_THRESHOLD {
+        partial_sums.iter().fold(zero(), |acc, elem| acc + elem)
     } else {
         array_pairwise_sum(partial_sums.into_iter(), zero)
     }

From 465063f7b794d8ae87897bb5b646fefcc807ddf1 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 22:16:39 +0000
Subject: [PATCH 09/41] Added documentation

---
 src/numeric_util.rs | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 8e3482177..e45e38b51 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -11,8 +11,25 @@ use num_traits::{self, Zero};
 use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
+/// Size threshold to switch to naive summation in all implementations of pairwise summation.
 const NAIVE_SUM_THRESHOLD: usize = 512;
 
+/// An implementation of pairwise summation for a vector slice.
+///
+/// Pairwise summation compute the sum of a set of *n* numbers by splitting
+/// it recursively in two halves, summing their elements and then adding the respective
+/// sums.
+/// It switches to the naive sum algorithm once the size of the set to be summed
+/// is below a certain pre-defined threshold ([`threshold`]).
+///
+/// Pairwise summation is useful to reduce the accumulated round-off error
+/// when summing floating point numbers.
+/// Pairwise summation provides an asymptotic error bound of *O(eps log n)*, where
+/// *eps* is machine precision, compared to *O(eps n)* of the naive summation algorithm.
+/// For more details, see [`paper`].
+///
+/// [`paper`]: https://epubs.siam.org/doi/10.1137/0914050
+/// [`threshold`]: constant.NAIVE_SUM_THRESHOLD.html
 pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
 where
     A: Clone + Add<Output=A> + Zero,
@@ -27,6 +44,11 @@ where
     }
 }
 
+/// An implementation of pairwise summation for an iterator.
+///
+/// See [`pairwise_sum`] for details on the algorithm.
+///
+/// [`pairwise_sum`]: fn.pairwise_sum.html
 pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
 where
     I: Iterator<Item=&'a A>,
@@ -44,6 +66,11 @@ where
     pairwise_sum(&partial_sums)
 }
 
+/// An implementation of pairwise summation for an iterator over *n*-dimensional arrays.
+///
+/// See [`pairwise_sum`] for details on the algorithm.
+///
+/// [`pairwise_sum`]: fn.pairwise_sum.html
 pub(crate) fn array_pairwise_sum<I, A, S, D, F>(iter: I, zero: F) -> Array<A, D>
 where
     I: Iterator<Item=ArrayBase<S, D>>,

From 6427d45cdfd6ccda9530e229351605d2f6208e04 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 22:22:42 +0000
Subject: [PATCH 10/41] Minor edits to the docs

---
 src/numeric_util.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index e45e38b51..bae732438 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -24,11 +24,12 @@ const NAIVE_SUM_THRESHOLD: usize = 512;
 ///
 /// Pairwise summation is useful to reduce the accumulated round-off error
 /// when summing floating point numbers.
-/// Pairwise summation provides an asymptotic error bound of *O(eps log n)*, where
-/// *eps* is machine precision, compared to *O(eps n)* of the naive summation algorithm.
-/// For more details, see [`paper`].
+/// Pairwise summation provides an asymptotic error bound of *O(ε* log *n)*, where
+/// *ε* is machine precision, compared to *O(εn)* of the naive summation algorithm.
+/// For more details, see [`paper`] or [`Wikipedia`].
 ///
 /// [`paper`]: https://epubs.siam.org/doi/10.1137/0914050
+/// [`Wikipedia`]: https://en.wikipedia.org/wiki/Pairwise_summation
 /// [`threshold`]: constant.NAIVE_SUM_THRESHOLD.html
 pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
 where

From 4414450a30641fcd4999c43e5775b6947a9134cc Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Fri, 4 Jan 2019 22:51:30 +0000
Subject: [PATCH 11/41] Don't forget to add the sum of the last elements (<512
 ending block).

---
 src/numeric_util.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index bae732438..fdf61360e 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -64,6 +64,8 @@ where
             partial_sum = A::zero();
         }
     }
+    partial_sums.push(partial_sum);
+
     pairwise_sum(&partial_sums)
 }
 
@@ -89,6 +91,7 @@ where
             partial_sum = zero();
         }
     }
+    partial_sums.push(partial_sum);
 
     if partial_sums.len() <= NAIVE_SUM_THRESHOLD {
         partial_sums.iter().fold(zero(), |acc, elem| acc + elem)

From aeaad0ee8cdd58119fb90064fdf2cc7c96260250 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 12:24:26 +0000
Subject: [PATCH 12/41] Add a benchmark for summing a contiguous array

---
 benches/numeric.rs | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index db0252c8a..1686d3874 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -1,6 +1,5 @@
 
 #![feature(test)]
-
 extern crate test;
 use test::Bencher;
 
@@ -25,3 +24,13 @@ fn clip(bench: &mut Bencher)
         })
     });
 }
+
+#[bench]
+fn contiguous_sum(bench: &mut Bencher)
+{
+    let n = 100000;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}

From 75109b1e66548f3c9fa246f2d6a32d251ede5814 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 12:30:13 +0000
Subject: [PATCH 13/41] Benchmarks for arrays of different length

---
 benches/numeric.rs | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index 1686d3874..02c43aa53 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -26,11 +26,32 @@ fn clip(bench: &mut Bencher)
 }
 
 #[bench]
-fn contiguous_sum(bench: &mut Bencher)
+fn contiguous_sum_1e7(bench: &mut Bencher)
 {
-    let n = 100000;
+    let n = 1e7 as usize;
     let a = Array::linspace(-1e6, 1e6, n);
     bench.iter(|| {
         a.sum()
     });
 }
+
+#[bench]
+fn contiguous_sum_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}
+

From 30851943b6f0a5f752d0aa0a5419235b297ce633 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 12:56:01 +0000
Subject: [PATCH 14/41] Don't split midpoint, saving one operation

---
 src/numeric_util.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index fdf61360e..f7a3f96bc 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -37,10 +37,9 @@ where
 {
     let n = v.len();
     if n <= NAIVE_SUM_THRESHOLD {
-        return unrolled_fold(v, A::zero, A::add);
+        unrolled_fold(v, A::zero, A::add)
     } else {
-        let mid_index = n / 2;
-        let (v1, v2) = v.split_at(mid_index);
+        let (v1, v2) = v.split_at(NAIVE_SUM_THRESHOLD);
         pairwise_sum(v1) + pairwise_sum(v2)
     }
 }

From 797e212c23e8c0569ca419b191071b93468b19af Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 12:56:57 +0000
Subject: [PATCH 15/41] Revert "Don't split midpoint, saving one operation"

This reverts commit 30851943b6f0a5f752d0aa0a5419235b297ce633.
---
 src/numeric_util.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index f7a3f96bc..fdf61360e 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -37,9 +37,10 @@ where
 {
     let n = v.len();
     if n <= NAIVE_SUM_THRESHOLD {
-        unrolled_fold(v, A::zero, A::add)
+        return unrolled_fold(v, A::zero, A::add);
     } else {
-        let (v1, v2) = v.split_at(NAIVE_SUM_THRESHOLD);
+        let mid_index = n / 2;
+        let (v1, v2) = v.split_at(mid_index);
         pairwise_sum(v1) + pairwise_sum(v2)
     }
 }

From d2b636b635b5790fd31ca4ff0acde9ba25b173eb Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 13:13:16 +0000
Subject: [PATCH 16/41] Benches for sum_axis

---
 benches/numeric.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index 02c43aa53..4239652bb 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -55,3 +55,26 @@ fn contiguous_sum_1e2(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn sum_by_row_1e4(bench: &mut Bencher)
+{
+    let n = 1e3 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n)
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(0))
+    });
+}
+
+#[bench]
+fn sum_by_col_1e4(bench: &mut Bencher)
+{
+    let n = 1e3 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n)
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}

From b3d2b42325f0b78fe3f1dcbd8bc327bde2f87db8 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sat, 5 Jan 2019 13:25:53 +0000
Subject: [PATCH 17/41] Bench for contiguous sum with integer values

---
 benches/numeric.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index 4239652bb..76d07f1e4 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -25,6 +25,16 @@ fn clip(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn contiguous_sum_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
 #[bench]
 fn contiguous_sum_1e7(bench: &mut Bencher)
 {

From 8f95705c4ab045e87a077e12e67edcb5f530177f Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Wed, 9 Jan 2019 08:48:45 +0000
Subject: [PATCH 18/41] Alternative implementation for sum_axis

---
 src/numeric/impl_numeric.rs | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 9551a1d30..bed2ae78c 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -103,22 +103,11 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Zero + Add<Output=A>,
               D: RemoveAxis,
     {
-        let n = self.len_of(axis);
-        let stride = self.strides()[axis.index()];
-        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
-        if self.ndim() == 2 && stride == 1 {
-            // contiguous along the axis we are summing
-            let ax = axis.index();
-            for (i, elt) in enumerate(&mut res) {
-                *elt = self.index_axis(Axis(1 - ax), i).sum();
-            }
-            res
-        } else {
-            numeric_util::array_pairwise_sum(
-                (0..n).map(|i| self.index_axis(axis, i)),
-                || res.clone()
-            )
-        }
+        let mut out = Array::zeros(self.dim.remove_axis(axis));
+        Zip::from(&mut out)
+            .and(self.lanes(axis))
+            .apply(|out, lane| *out = lane.sum());
+        out
     }
 
     /// Return mean along `axis`.

From 74a74ae2a745b77eee467f90dc18009a53b4fa59 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Wed, 9 Jan 2019 08:55:25 +0000
Subject: [PATCH 19/41] Revert "Alternative implementation for sum_axis"

This reverts commit 8f95705c4ab045e87a077e12e67edcb5f530177f.
---
 src/numeric/impl_numeric.rs | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index bed2ae78c..9551a1d30 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -103,11 +103,22 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Zero + Add<Output=A>,
               D: RemoveAxis,
     {
-        let mut out = Array::zeros(self.dim.remove_axis(axis));
-        Zip::from(&mut out)
-            .and(self.lanes(axis))
-            .apply(|out, lane| *out = lane.sum());
-        out
+        let n = self.len_of(axis);
+        let stride = self.strides()[axis.index()];
+        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
+        if self.ndim() == 2 && stride == 1 {
+            // contiguous along the axis we are summing
+            let ax = axis.index();
+            for (i, elt) in enumerate(&mut res) {
+                *elt = self.index_axis(Axis(1 - ax), i).sum();
+            }
+            res
+        } else {
+            numeric_util::array_pairwise_sum(
+                (0..n).map(|i| self.index_axis(axis, i)),
+                || res.clone()
+            )
+        }
     }
 
     /// Return mean along `axis`.

From a592a7dc537072a174fa6c4affdef7c483a122f8 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Wed, 9 Jan 2019 08:57:23 +0000
Subject: [PATCH 20/41] Ensure equal block size independently of underlying
 implementation

---
 src/numeric_util.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index fdf61360e..fe41aa87f 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -12,7 +12,8 @@ use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
 /// Size threshold to switch to naive summation in all implementations of pairwise summation.
-const NAIVE_SUM_THRESHOLD: usize = 512;
+const SIMD_NAIVE_SUM_THRESHOLD: usize = 512;
+const NO_SIMD_NAIVE_SUM_THRESHOLD: usize = SIMD_NAIVE_SUM_THRESHOLD / 8;
 
 /// An implementation of pairwise summation for a vector slice.
 ///
@@ -36,7 +37,7 @@ where
     A: Clone + Add<Output=A> + Zero,
 {
     let n = v.len();
-    if n <= NAIVE_SUM_THRESHOLD {
+    if n <= SIMD_NAIVE_SUM_THRESHOLD {
         return unrolled_fold(v, A::zero, A::add);
     } else {
         let mid_index = n / 2;
@@ -59,7 +60,7 @@ where
     let mut partial_sum = A::zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x.clone();
-        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
+        if i % NO_SIMD_NAIVE_SUM_THRESHOLD == NO_SIMD_NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = A::zero();
         }
@@ -86,14 +87,14 @@ where
     let mut partial_sum = zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x;
-        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
+        if i % NO_SIMD_NAIVE_SUM_THRESHOLD == NO_SIMD_NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = zero();
         }
     }
     partial_sums.push(partial_sum);
 
-    if partial_sums.len() <= NAIVE_SUM_THRESHOLD {
+    if partial_sums.len() <= NO_SIMD_NAIVE_SUM_THRESHOLD {
         partial_sums.iter().fold(zero(), |acc, elem| acc + elem)
     } else {
         array_pairwise_sum(partial_sums.into_iter(), zero)

From f73fb2d449c604f2ab0bd57d3f8f5b2680a76bf2 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Tue, 22 Jan 2019 07:47:05 +0000
Subject: [PATCH 21/41] Change threshold names

---
 src/numeric_util.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index fe41aa87f..80047c171 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -12,8 +12,9 @@ use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
 /// Size threshold to switch to naive summation in all implementations of pairwise summation.
-const SIMD_NAIVE_SUM_THRESHOLD: usize = 512;
-const NO_SIMD_NAIVE_SUM_THRESHOLD: usize = SIMD_NAIVE_SUM_THRESHOLD / 8;
+const NAIVE_SUM_THRESHOLD: usize = 64;
+/// Number of elements processed by unrolled operators (to leverage SIMD instructions).
+const UNROLL_SIZE: usize = 8;
 
 /// An implementation of pairwise summation for a vector slice.
 ///
@@ -37,7 +38,7 @@ where
     A: Clone + Add<Output=A> + Zero,
 {
     let n = v.len();
-    if n <= SIMD_NAIVE_SUM_THRESHOLD {
+    if n <= NAIVE_SUM_THRESHOLD * UNROLL_SIZE {
         return unrolled_fold(v, A::zero, A::add);
     } else {
         let mid_index = n / 2;
@@ -60,7 +61,7 @@ where
     let mut partial_sum = A::zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x.clone();
-        if i % NO_SIMD_NAIVE_SUM_THRESHOLD == NO_SIMD_NAIVE_SUM_THRESHOLD - 1 {
+        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = A::zero();
         }
@@ -87,14 +88,14 @@ where
     let mut partial_sum = zero();
     for (i, x) in iter.enumerate() {
         partial_sum = partial_sum + x;
-        if i % NO_SIMD_NAIVE_SUM_THRESHOLD == NO_SIMD_NAIVE_SUM_THRESHOLD - 1 {
+        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
             partial_sums.push(partial_sum);
             partial_sum = zero();
         }
     }
     partial_sums.push(partial_sum);
 
-    if partial_sums.len() <= NO_SIMD_NAIVE_SUM_THRESHOLD {
+    if partial_sums.len() <= NAIVE_SUM_THRESHOLD {
         partial_sums.iter().fold(zero(), |acc, elem| acc + elem)
     } else {
         array_pairwise_sum(partial_sums.into_iter(), zero)

From c7fa091c9d160ec5e173783cb350aacf0c5dc368 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Tue, 22 Jan 2019 07:55:18 +0000
Subject: [PATCH 22/41] Change sum_axis implementation

---
 src/numeric/impl_numeric.rs | 10 +++++-----
 src/numeric_util.rs         | 33 +--------------------------------
 2 files changed, 6 insertions(+), 37 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 9551a1d30..15ddd789e 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -105,19 +105,19 @@ impl<A, S, D> ArrayBase<S, D>
     {
         let n = self.len_of(axis);
         let stride = self.strides()[axis.index()];
-        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
         if self.ndim() == 2 && stride == 1 {
             // contiguous along the axis we are summing
+            let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
             let ax = axis.index();
             for (i, elt) in enumerate(&mut res) {
                 *elt = self.index_axis(Axis(1 - ax), i).sum();
             }
             res
+        } else if self.len_of(axis) <= numeric_util::NAIVE_SUM_THRESHOLD {
+            self.fold_axis(axis, A::zero(), |acc, x| acc.clone() + x.clone())
         } else {
-            numeric_util::array_pairwise_sum(
-                (0..n).map(|i| self.index_axis(axis, i)),
-                || res.clone()
-            )
+            let (v1, v2) = self.view().split_at(axis, n / 2);
+            v1.sum_axis(axis) + v2.sum_axis(axis)
         }
     }
 
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 80047c171..f36e81e97 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -12,7 +12,7 @@ use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
 /// Size threshold to switch to naive summation in all implementations of pairwise summation.
-const NAIVE_SUM_THRESHOLD: usize = 64;
+pub(crate) const NAIVE_SUM_THRESHOLD: usize = 64;
 /// Number of elements processed by unrolled operators (to leverage SIMD instructions).
 const UNROLL_SIZE: usize = 8;
 
@@ -71,37 +71,6 @@ where
     pairwise_sum(&partial_sums)
 }
 
-/// An implementation of pairwise summation for an iterator over *n*-dimensional arrays.
-///
-/// See [`pairwise_sum`] for details on the algorithm.
-///
-/// [`pairwise_sum`]: fn.pairwise_sum.html
-pub(crate) fn array_pairwise_sum<I, A, S, D, F>(iter: I, zero: F) -> Array<A, D>
-where
-    I: Iterator<Item=ArrayBase<S, D>>,
-    S: Data<Elem=A>,
-    D: Dimension,
-    A: Clone + Add<Output=A>,
-    F: Fn() -> Array<A, D>,
-{
-    let mut partial_sums = vec![];
-    let mut partial_sum = zero();
-    for (i, x) in iter.enumerate() {
-        partial_sum = partial_sum + x;
-        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
-            partial_sums.push(partial_sum);
-            partial_sum = zero();
-        }
-    }
-    partial_sums.push(partial_sum);
-
-    if partial_sums.len() <= NAIVE_SUM_THRESHOLD {
-        partial_sums.iter().fold(zero(), |acc, elem| acc + elem)
-    } else {
-        array_pairwise_sum(partial_sums.into_iter(), zero)
-    }
-}
-
 /// Fold over the manually unrolled `xs` with `f`
 pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
     where A: Clone,

From f72164ab26b31a62c03ae80de2ddc51e31a2bf62 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Tue, 22 Jan 2019 07:59:29 +0000
Subject: [PATCH 23/41] Reduce partial accumulators pairwise in unrolled_fold

---
 src/numeric_util.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index f36e81e97..ea838b7e4 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -79,7 +79,6 @@ pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
 {
     // eightfold unrolled so that floating point can be vectorized
     // (even with strict floating point accuracy semantics)
-    let mut acc = init();
     let (mut p0, mut p1, mut p2, mut p3,
          mut p4, mut p5, mut p6, mut p7) =
         (init(), init(), init(), init(),
@@ -96,18 +95,19 @@ pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
 
         xs = &xs[8..];
     }
-    acc = f(acc.clone(), f(p0, p4));
-    acc = f(acc.clone(), f(p1, p5));
-    acc = f(acc.clone(), f(p2, p6));
-    acc = f(acc.clone(), f(p3, p7));
+    let (q0, q1, q2, q3) = (f(p0, p4), f(p1, p5), f(p2, p6), f(p3, p7));
+    let (r0, r1) = (f(q0, q2), f(q1, q3));
+    let unrolled = f(r0, r1);
 
     // make it clear to the optimizer that this loop is short
     // and can not be autovectorized.
+    let mut partial = init();
     for i in 0..xs.len() {
         if i >= 7 { break; }
-        acc = f(acc.clone(), xs[i].clone())
+        partial = f(partial.clone(), xs[i].clone())
     }
-    acc
+
+    f(unrolled, partial)
 }
 
 /// Compute the dot product.

From 9f1c4d261cf23106a117d1ac31b4bf53b7652dd0 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Tue, 22 Jan 2019 08:00:44 +0000
Subject: [PATCH 24/41] Remove unused imports

---
 src/numeric_util.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index ea838b7e4..17470446a 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -8,7 +8,6 @@
 use std::cmp;
 use std::ops::Add;
 use num_traits::{self, Zero};
-use super::{ArrayBase, Array, Data, Dimension};
 use crate::LinalgScalar;
 
 /// Size threshold to switch to naive summation in all implementations of pairwise summation.

From bbc4a756947e5b4bcbb39e5fefa0e3628de860a9 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Tue, 22 Jan 2019 08:09:41 +0000
Subject: [PATCH 25/41] Get uniform behaviour across all pairwise_sum
 implementations

---
 src/numeric_util.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 17470446a..bcd1080e8 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -67,7 +67,25 @@ where
     }
     partial_sums.push(partial_sum);
 
-    pairwise_sum(&partial_sums)
+    pure_pairwise_sum(&partial_sums)
+}
+
+/// An implementation of pairwise summation for a vector slice that never
+/// switches to the naive sum algorithm.
+fn pure_pairwise_sum<A>(v: &[A]) -> A
+    where
+        A: Clone + Add<Output=A> + Zero,
+{
+    let n = v.len();
+    match n {
+        0 => A::zero(),
+        1 => v[0].clone(),
+        n => {
+            let mid_index = n / 2;
+            let (v1, v2) = v.split_at(mid_index);
+            pure_pairwise_sum(v1) + pure_pairwise_sum(v2)
+        }
+    }
 }
 
 /// Fold over the manually unrolled `xs` with `f`

From b98e30b2d59d30df7de7238bc29ba36903026e32 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sat, 2 Feb 2019 22:31:07 -0500
Subject: [PATCH 26/41] Add more benchmarks of sum/sum_axis

---
 benches/numeric.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index 76d07f1e4..5a7c111a5 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -1,7 +1,7 @@
 
 #![feature(test)]
 extern crate test;
-use test::Bencher;
+use test::{black_box, Bencher};
 
 extern crate ndarray;
 use ndarray::prelude::*;
@@ -65,6 +65,38 @@ fn contiguous_sum_1e2(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn contiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * n)
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| black_box(&a).sum());
+}
+
+#[bench]
+fn inner_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * 2*n)
+        .into_shape([n, n, 2*n])
+        .unwrap();
+    let v = a.slice(s![.., .., ..;2]);
+    bench.iter(|| black_box(&v).sum());
+}
+
+#[bench]
+fn middle_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * 2*n * n)
+        .into_shape([n, 2*n, n])
+        .unwrap();
+    let v = a.slice(s![.., ..;2, ..]);
+    bench.iter(|| black_box(&v).sum());
+}
+
 #[bench]
 fn sum_by_row_1e4(bench: &mut Bencher)
 {
@@ -88,3 +120,15 @@ fn sum_by_col_1e4(bench: &mut Bencher)
         a.sum_axis(Axis(1))
     });
 }
+
+#[bench]
+fn sum_by_middle_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * n)
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}

From ed88e2e9a8445c1f9dccf1d408f6a79ea1ace17b Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sat, 2 Feb 2019 23:32:56 -0500
Subject: [PATCH 27/41] Improve performance of iterator_pairwise_sum

---
 src/numeric_util.rs | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index bcd1080e8..23966b9a9 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -56,16 +56,18 @@ where
     I: Iterator<Item=&'a A>,
     A: Clone + Add<Output=A> + Zero,
 {
-    let mut partial_sums = vec![];
-    let mut partial_sum = A::zero();
-    for (i, x) in iter.enumerate() {
-        partial_sum = partial_sum + x.clone();
-        if i % NAIVE_SUM_THRESHOLD == NAIVE_SUM_THRESHOLD - 1 {
+    let (len, _) = iter.size_hint();
+    let cap = len.saturating_sub(1) / NAIVE_SUM_THRESHOLD + 1; // ceiling of division
+    let mut partial_sums = Vec::with_capacity(cap);
+    let (_, last_sum) = iter.fold((0, A::zero()), |(count, partial_sum), x| {
+        if count < NAIVE_SUM_THRESHOLD {
+            (count + 1, partial_sum + x.clone())
+        } else {
             partial_sums.push(partial_sum);
-            partial_sum = A::zero();
+            (1, x.clone())
         }
-    }
-    partial_sums.push(partial_sum);
+    });
+    partial_sums.push(last_sum);
 
     pure_pairwise_sum(&partial_sums)
 }
@@ -205,3 +207,17 @@ pub fn unrolled_eq<A>(xs: &[A], ys: &[A]) -> bool
 
     true
 }
+
+#[cfg(test)]
+mod tests {
+    use quickcheck::quickcheck;
+    use std::num::Wrapping;
+    use super::iterator_pairwise_sum;
+
+    quickcheck! {
+        fn iterator_pairwise_sum_is_correct(xs: Vec<i32>) -> bool {
+            let xs: Vec<_> = xs.into_iter().map(|x| Wrapping(x)).collect();
+            iterator_pairwise_sum(xs.iter()) == xs.iter().sum()
+        }
+    }
+}

From e7835ee671eb547bab955251eeda9bcc6a75f38f Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sat, 2 Feb 2019 23:40:02 -0500
Subject: [PATCH 28/41] Make sum pairwise over all dimensions

---
 src/numeric/impl_numeric.rs | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 15ddd789e..7c417a1aa 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -33,17 +33,10 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Add<Output=A> + num_traits::Zero,
     {
         if let Some(slc) = self.as_slice_memory_order() {
-            return numeric_util::pairwise_sum(&slc)
-        }
-        let mut sum = A::zero();
-        for row in self.inner_rows() {
-            if let Some(slc) = row.as_slice() {
-                sum = sum + numeric_util::pairwise_sum(&slc);
-            } else {
-                sum = sum + numeric_util::iterator_pairwise_sum(row.iter());
-            }
+            numeric_util::pairwise_sum(&slc)
+        } else {
+            numeric_util::iterator_pairwise_sum(self.iter())
         }
-        sum
     }
 
     /// Return the sum of all elements in the array.

From 8301c25bb242bd788bbaf6359a30d2e286c3753a Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sat, 2 Feb 2019 23:59:30 -0500
Subject: [PATCH 29/41] Implement contiguous sum_axis in terms of Zip

This is slightly faster and is easier to understand.
---
 src/numeric/impl_numeric.rs | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 7c417a1aa..0dd273525 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -8,7 +8,6 @@
 
 use std::ops::{Add, Div, Mul};
 use num_traits::{self, Zero, Float, FromPrimitive};
-use itertools::free::enumerate;
 
 use crate::imp_prelude::*;
 use crate::numeric_util;
@@ -97,14 +96,12 @@ impl<A, S, D> ArrayBase<S, D>
               D: RemoveAxis,
     {
         let n = self.len_of(axis);
-        let stride = self.strides()[axis.index()];
-        if self.ndim() == 2 && stride == 1 {
+        if self.stride_of(axis) == 1 {
             // contiguous along the axis we are summing
             let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
-            let ax = axis.index();
-            for (i, elt) in enumerate(&mut res) {
-                *elt = self.index_axis(Axis(1 - ax), i).sum();
-            }
+            Zip::from(&mut res)
+                .and(self.lanes(axis))
+                .apply(|sum, lane| *sum = lane.sum());
             res
         } else if self.len_of(axis) <= numeric_util::NAIVE_SUM_THRESHOLD {
             self.fold_axis(axis, A::zero(), |acc, x| acc.clone() + x.clone())

From 82453dfc73a217acc24fa85aa7f9108e28da5713 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 00:02:24 -0500
Subject: [PATCH 30/41] Remove redundant len_of call

---
 src/numeric/impl_numeric.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 0dd273525..0be4a0ce0 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -103,7 +103,7 @@ impl<A, S, D> ArrayBase<S, D>
                 .and(self.lanes(axis))
                 .apply(|sum, lane| *sum = lane.sum());
             res
-        } else if self.len_of(axis) <= numeric_util::NAIVE_SUM_THRESHOLD {
+        } else if n <= numeric_util::NAIVE_SUM_THRESHOLD {
             self.fold_axis(axis, A::zero(), |acc, x| acc.clone() + x.clone())
         } else {
             let (v1, v2) = self.view().split_at(axis, n / 2);

From 978f45a29a11608c542994c6b029c893a3ff4afb Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sun, 3 Feb 2019 16:30:04 +0000
Subject: [PATCH 31/41] Added test for axis independence

---
 src/numeric/impl_numeric.rs | 60 +++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 0be4a0ce0..061b574db 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -277,3 +277,63 @@ impl<A, S, D> ArrayBase<S, D>
     }
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use self::Array;
+    use quickcheck::quickcheck;
+
+    quickcheck! {
+        fn sum_value_does_not_depend_on_axis(xs: Vec<f64>) -> bool {
+            // We want three axis of equal length - we drop some elements
+            // to get the right number
+            let axis_length = (xs.len() as f64).cbrt().floor() as usize;
+            let xs = &xs[..axis_length.pow(3)];
+
+            // We want to check that summing with respect to an axis
+            // is independent from the specific underlying implementation of
+            // pairwise sum, which is itself conditional on the arrangement
+            // in memory of the array elements.
+            // We will thus swap axes and compute the sum, in turn, with respect to
+            // axes 0, 1 and 2, while making sure that mathematically the same
+            // number should be spit out (because we are properly transposing before summing).
+            if axis_length > 0 {
+                let a = Array::from_vec(xs.to_vec())
+                    .into_shape((axis_length, axis_length, axis_length))
+                    .unwrap();
+                assert!(a.is_standard_layout());
+                let sum1 = a.sum_axis(Axis(0));
+
+                let mut b = Array::zeros(a.raw_dim());
+                assert!(b.is_standard_layout());
+                for i in 0..axis_length {
+                    for j in 0..axis_length {
+                        for k in 0..axis_length {
+                            b[(i, j, k)] = a[(j, i, k)].clone();
+                        }
+                    }
+                }
+                let sum2 = b.sum_axis(Axis(1));
+
+                let mut c = Array::zeros(a.raw_dim());
+                assert!(c.is_standard_layout());
+                for i in 0..axis_length {
+                    for j in 0..axis_length {
+                        for k in 0..axis_length {
+                            c[(i, j, k)] = a[(k, i, j)].clone();
+                        }
+                    }
+                }
+                let sum3 = c.sum_axis(Axis(2));
+
+                let tol = 1e-10;
+                let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+                let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+                let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+                first && second && third
+            } else {
+                true
+            }
+        }
+    }
+}
\ No newline at end of file

From fa0ba30c25619c69efeaad9e30218ba84bde4ad7 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sun, 3 Feb 2019 16:57:36 +0000
Subject: [PATCH 32/41] Make sure we actually exercise the pairwise technique

---
 Cargo.toml                  |   1 +
 src/lib.rs                  |   2 +
 src/numeric/impl_numeric.rs | 105 +++++++++++++++++++++---------------
 src/numeric_util.rs         |   7 ++-
 4 files changed, 72 insertions(+), 43 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2f4f8bb0a..bdab7df52 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,7 @@ serde = { version = "1.0", optional = true }
 defmac = "0.2"
 quickcheck = { version = "0.7.2", default-features = false }
 rawpointer = "0.1"
+rand = "0.5.5"
 
 [features]
 # Enable blas usage
diff --git a/src/lib.rs b/src/lib.rs
index d998be49f..39f102e6f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -104,6 +104,8 @@ extern crate num_integer;
 
 #[cfg(test)]
 extern crate quickcheck;
+#[cfg(test)]
+extern crate rand;
 
 #[cfg(feature = "docs")]
 pub mod doc;
diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 061b574db..261429651 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -280,60 +280,81 @@ impl<A, S, D> ArrayBase<S, D>
 #[cfg(test)]
 mod tests {
     use super::*;
+    use super::numeric_util::{NAIVE_SUM_THRESHOLD, UNROLL_SIZE};
     use self::Array;
-    use quickcheck::quickcheck;
+    use quickcheck::{QuickCheck, StdGen, TestResult};
 
-    quickcheck! {
-        fn sum_value_does_not_depend_on_axis(xs: Vec<f64>) -> bool {
-            // We want three axis of equal length - we drop some elements
-            // to get the right number
-            let axis_length = (xs.len() as f64).cbrt().floor() as usize;
-            let xs = &xs[..axis_length.pow(3)];
+    #[test]
+    fn test_sum_value_does_not_depend_on_axis() {
+        // `size` controls the length of the array of data
+        // We set it to be randomly drawn between 0 and
+        // a number larger than NAIVE_SUM_THRESHOLD * UNROLL_SIZE
+        let rng = StdGen::new(
+            rand::thread_rng(),
+            5* (NAIVE_SUM_THRESHOLD * UNROLL_SIZE).pow(3)
+        );
+        let mut quickcheck = QuickCheck::new().gen(rng).tests(100);
+        quickcheck.quickcheck(
+           _sum_value_does_not_depend_on_axis
+            as fn(
+               Vec<f64>
+            ) -> TestResult,
+        );
+    }
+
+    fn _sum_value_does_not_depend_on_axis(xs: Vec<f64>) -> TestResult {
+        // We want three axis of equal length - we drop some elements
+        // to get the right number
+        let axis_length = (xs.len() as f64).cbrt().floor() as usize;
+        let xs = &xs[..axis_length.pow(3)];
 
-            // We want to check that summing with respect to an axis
-            // is independent from the specific underlying implementation of
-            // pairwise sum, which is itself conditional on the arrangement
-            // in memory of the array elements.
-            // We will thus swap axes and compute the sum, in turn, with respect to
-            // axes 0, 1 and 2, while making sure that mathematically the same
-            // number should be spit out (because we are properly transposing before summing).
-            if axis_length > 0 {
-                let a = Array::from_vec(xs.to_vec())
-                    .into_shape((axis_length, axis_length, axis_length))
-                    .unwrap();
-                assert!(a.is_standard_layout());
-                let sum1 = a.sum_axis(Axis(0));
+        // We want to check that summing with respect to an axis
+        // is independent from the specific underlying implementation of
+        // pairwise sum, which is itself conditional on the arrangement
+        // in memory of the array elements.
+        // We will thus swap axes and compute the sum, in turn, with respect to
+        // axes 0, 1 and 2, while making sure that mathematically the same
+        // number should be spit out (because we are properly transposing before summing).
+        if axis_length > 0 {
+            let a = Array::from_vec(xs.to_vec())
+                .into_shape((axis_length, axis_length, axis_length))
+                .unwrap();
+            assert!(a.is_standard_layout());
+            let sum1 = a.sum_axis(Axis(0));
 
-                let mut b = Array::zeros(a.raw_dim());
-                assert!(b.is_standard_layout());
-                for i in 0..axis_length {
-                    for j in 0..axis_length {
-                        for k in 0..axis_length {
-                            b[(i, j, k)] = a[(j, i, k)].clone();
-                        }
+            let mut b = Array::zeros(a.raw_dim());
+            assert!(b.is_standard_layout());
+            for i in 0..axis_length {
+                for j in 0..axis_length {
+                    for k in 0..axis_length {
+                        b[(i, j, k)] = a[(j, i, k)].clone();
                     }
                 }
-                let sum2 = b.sum_axis(Axis(1));
+            }
+            let sum2 = b.sum_axis(Axis(1));
 
-                let mut c = Array::zeros(a.raw_dim());
-                assert!(c.is_standard_layout());
-                for i in 0..axis_length {
-                    for j in 0..axis_length {
-                        for k in 0..axis_length {
-                            c[(i, j, k)] = a[(k, i, j)].clone();
-                        }
+            let mut c = Array::zeros(a.raw_dim());
+            assert!(c.is_standard_layout());
+            for i in 0..axis_length {
+                for j in 0..axis_length {
+                    for k in 0..axis_length {
+                        c[(i, j, k)] = a[(k, i, j)].clone();
                     }
                 }
-                let sum3 = c.sum_axis(Axis(2));
+            }
+            let sum3 = c.sum_axis(Axis(2));
 
-                let tol = 1e-10;
-                let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
-                let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
-                let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
-                first && second && third
+            let tol = 1e-10;
+            let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+            if first && second && third {
+                TestResult::passed()
             } else {
-                true
+                TestResult::failed()
             }
+        } else {
+            TestResult::passed()
         }
     }
 }
\ No newline at end of file
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 23966b9a9..2557f2c68 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -11,9 +11,14 @@ use num_traits::{self, Zero};
 use crate::LinalgScalar;
 
 /// Size threshold to switch to naive summation in all implementations of pairwise summation.
+#[cfg(not(test))]
 pub(crate) const NAIVE_SUM_THRESHOLD: usize = 64;
+// Set it to a smaller number for testing purposes
+#[cfg(test)]
+pub(crate) const NAIVE_SUM_THRESHOLD: usize = 2;
+
 /// Number of elements processed by unrolled operators (to leverage SIMD instructions).
-const UNROLL_SIZE: usize = 8;
+pub(crate) const UNROLL_SIZE: usize = 8;
 
 /// An implementation of pairwise summation for a vector slice.
 ///

From b4136d77a80110f71dc85ee0f2cd06228573c394 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sun, 3 Feb 2019 17:15:31 +0000
Subject: [PATCH 33/41] Test discontinuous arrays

---
 src/numeric/impl_numeric.rs | 114 ++++++++++++++++++++++++++++--------
 1 file changed, 90 insertions(+), 24 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 261429651..b8c393372 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -281,7 +281,7 @@ impl<A, S, D> ArrayBase<S, D>
 mod tests {
     use super::*;
     use super::numeric_util::{NAIVE_SUM_THRESHOLD, UNROLL_SIZE};
-    use self::Array;
+    use self::{Array, s};
     use quickcheck::{QuickCheck, StdGen, TestResult};
 
     #[test]
@@ -316,38 +316,70 @@ mod tests {
         // axes 0, 1 and 2, while making sure that mathematically the same
         // number should be spit out (because we are properly transposing before summing).
         if axis_length > 0 {
-            let a = Array::from_vec(xs.to_vec())
-                .into_shape((axis_length, axis_length, axis_length))
-                .unwrap();
-            assert!(a.is_standard_layout());
-            let sum1 = a.sum_axis(Axis(0));
+            let (a, b, c) = equivalent_arrays(xs.to_vec(), axis_length);
 
-            let mut b = Array::zeros(a.raw_dim());
-            assert!(b.is_standard_layout());
-            for i in 0..axis_length {
-                for j in 0..axis_length {
-                    for k in 0..axis_length {
-                        b[(i, j, k)] = a[(j, i, k)].clone();
-                    }
-                }
-            }
+            let sum1 = a.sum_axis(Axis(0));
             let sum2 = b.sum_axis(Axis(1));
+            let sum3 = c.sum_axis(Axis(2));
 
-            let mut c = Array::zeros(a.raw_dim());
-            assert!(c.is_standard_layout());
-            for i in 0..axis_length {
-                for j in 0..axis_length {
-                    for k in 0..axis_length {
-                        c[(i, j, k)] = a[(k, i, j)].clone();
-                    }
-                }
+            let tol = 1e-10;
+            let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+
+            if first && second && third {
+                TestResult::passed()
+            } else {
+                TestResult::failed()
             }
-            let sum3 = c.sum_axis(Axis(2));
+        } else {
+            TestResult::passed()
+        }
+    }
+
+    #[test]
+    fn test_sum_value_does_not_depend_on_axis_with_discontinuous_array() {
+        // `size` controls the length of the array of data
+        // We set it to be randomly drawn between 0 and
+        // a number larger than NAIVE_SUM_THRESHOLD * UNROLL_SIZE
+        let rng = StdGen::new(
+            rand::thread_rng(),
+            5* (NAIVE_SUM_THRESHOLD * UNROLL_SIZE).pow(3)
+        );
+        let mut quickcheck = QuickCheck::new().gen(rng).tests(100);
+        quickcheck.quickcheck(
+            _sum_value_does_not_depend_on_axis_w_discontinuous_array
+                as fn(
+                Vec<f64>
+            ) -> TestResult,
+        );
+    }
+
+    fn _sum_value_does_not_depend_on_axis_w_discontinuous_array(xs: Vec<f64>) -> TestResult {
+        // We want three axis of equal length - we drop some elements
+        // to get the right number
+        let axis_length = (xs.len() as f64).cbrt().floor() as usize;
+        let xs = &xs[..axis_length.pow(3)];
+
+        // We want to check that summing with respect to an axis
+        // is independent from the specific underlying implementation of
+        // pairwise sum, which is itself conditional on the arrangement
+        // in memory of the array elements.
+        // We will thus swap axes and compute the sum, in turn, with respect to
+        // axes 0, 1 and 2, while making sure that mathematically the same
+        // number should be spit out (because we are properly transposing before summing).
+        if axis_length > 0 {
+            let (a, b, c) = equivalent_arrays(xs.to_vec(), axis_length);
+
+            let sum1 = a.slice(s![..;2, .., ..]).sum_axis(Axis(0));
+            let sum2 = b.slice(s![.., ..;2, ..]).sum_axis(Axis(1));
+            let sum3 = c.slice(s![.., .., ..;2]).sum_axis(Axis(2));
 
             let tol = 1e-10;
             let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
             let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
             let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+
             if first && second && third {
                 TestResult::passed()
             } else {
@@ -357,4 +389,38 @@ mod tests {
             TestResult::passed()
         }
     }
+
+    // Given a vector with axis_length^3 elements, it returns three arrays,
+    // built using the vector elements, such that (mathematically):
+    // a.sum_axis(Axis(0) == b.sum_axis(Axis(1)) == c.sum_axis(Axis(2))
+    fn equivalent_arrays(xs: Vec<f64>, axis_length: usize) -> (Array3<f64>, Array3<f64>, Array3<f64>) {
+        assert!(xs.len() == axis_length.pow(3));
+
+        let a = Array::from_vec(xs)
+            .into_shape((axis_length, axis_length, axis_length))
+            .unwrap();
+        assert!(a.is_standard_layout());
+
+        let mut b = Array::zeros(a.raw_dim());
+        assert!(b.is_standard_layout());
+        for i in 0..axis_length {
+            for j in 0..axis_length {
+                for k in 0..axis_length {
+                    b[(i, j, k)] = a[(j, i, k)].clone();
+                }
+            }
+        }
+
+        let mut c = Array::zeros(a.raw_dim());
+        assert!(c.is_standard_layout());
+        for i in 0..axis_length {
+            for j in 0..axis_length {
+                for k in 0..axis_length {
+                    c[(i, j, k)] = a[(k, i, j)].clone();
+                }
+            }
+        }
+        return (a, b, c)
+    }
+
 }
\ No newline at end of file

From 4a63cb3c5669c4463e45b302a983e5266e7a1691 Mon Sep 17 00:00:00 2001
From: LukeMathWalker <lucapalmieri1993@hotmail.com>
Date: Sun, 3 Feb 2019 17:26:52 +0000
Subject: [PATCH 34/41] Add more integer benchmark equivalents

---
 benches/numeric.rs | 103 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 7 deletions(-)

diff --git a/benches/numeric.rs b/benches/numeric.rs
index 5a7c111a5..d94ad0835 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -25,21 +25,22 @@ fn clip(bench: &mut Bencher)
     });
 }
 
+
 #[bench]
-fn contiguous_sum_int_1e4(bench: &mut Bencher)
+fn contiguous_sum_1e7(bench: &mut Bencher)
 {
-    let n = 1e4 as usize;
-    let a = Array::from_vec((0..n).collect());
+    let n = 1e7 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
     bench.iter(|| {
         a.sum()
     });
 }
 
 #[bench]
-fn contiguous_sum_1e7(bench: &mut Bencher)
+fn contiguous_sum_int_1e7(bench: &mut Bencher)
 {
     let n = 1e7 as usize;
-    let a = Array::linspace(-1e6, 1e6, n);
+    let a = Array::from_vec((0..n).collect());
     bench.iter(|| {
         a.sum()
     });
@@ -55,6 +56,16 @@ fn contiguous_sum_1e4(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn contiguous_sum_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
 #[bench]
 fn contiguous_sum_1e2(bench: &mut Bencher)
 {
@@ -65,6 +76,16 @@ fn contiguous_sum_1e2(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn contiguous_sum_int_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
 #[bench]
 fn contiguous_sum_ix3_1e2(bench: &mut Bencher)
 {
@@ -75,6 +96,16 @@ fn contiguous_sum_ix3_1e2(bench: &mut Bencher)
     bench.iter(|| black_box(&a).sum());
 }
 
+#[bench]
+fn contiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n.pow(3)).collect())
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| black_box(&a).sum());
+}
+
 #[bench]
 fn inner_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
 {
@@ -86,6 +117,17 @@ fn inner_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
     bench.iter(|| black_box(&v).sum());
 }
 
+#[bench]
+fn inner_discontiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..(n.pow(3) * 2)).collect())
+        .into_shape([n, n, 2*n])
+        .unwrap();
+    let v = a.slice(s![.., .., ..;2]);
+    bench.iter(|| black_box(&v).sum());
+}
+
 #[bench]
 fn middle_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
 {
@@ -97,10 +139,21 @@ fn middle_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
     bench.iter(|| black_box(&v).sum());
 }
 
+#[bench]
+fn middle_discontiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..(n.pow(3) * 2)).collect())
+        .into_shape([n, 2*n, n])
+        .unwrap();
+    let v = a.slice(s![.., ..;2, ..]);
+    bench.iter(|| black_box(&v).sum());
+}
+
 #[bench]
 fn sum_by_row_1e4(bench: &mut Bencher)
 {
-    let n = 1e3 as usize;
+    let n = 1e4 as usize;
     let a = Array::linspace(-1e6, 1e6, n * n)
         .into_shape([n, n])
         .unwrap();
@@ -109,10 +162,22 @@ fn sum_by_row_1e4(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn sum_by_row_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n.pow(2)).collect())
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(0))
+    });
+}
+
 #[bench]
 fn sum_by_col_1e4(bench: &mut Bencher)
 {
-    let n = 1e3 as usize;
+    let n = 1e4 as usize;
     let a = Array::linspace(-1e6, 1e6, n * n)
         .into_shape([n, n])
         .unwrap();
@@ -121,6 +186,18 @@ fn sum_by_col_1e4(bench: &mut Bencher)
     });
 }
 
+#[bench]
+fn sum_by_col_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n.pow(2)).collect())
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}
+
 #[bench]
 fn sum_by_middle_1e2(bench: &mut Bencher)
 {
@@ -132,3 +209,15 @@ fn sum_by_middle_1e2(bench: &mut Bencher)
         a.sum_axis(Axis(1))
     });
 }
+
+#[bench]
+fn sum_by_middle_int_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n.pow(3)).collect())
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}

From f306b5f2812652a2763ae86520cce18328454366 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 18:11:37 -0500
Subject: [PATCH 35/41] Fix min_stride_axis to prefer axes with length > 1

---
 src/dimension/dimension_trait.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/dimension/dimension_trait.rs b/src/dimension/dimension_trait.rs
index f8357f069..ba42ead07 100644
--- a/src/dimension/dimension_trait.rs
+++ b/src/dimension/dimension_trait.rs
@@ -291,8 +291,8 @@ pub trait Dimension : Clone + Eq + Debug + Send + Sync + Default +
         indices
     }
 
-    /// Compute the minimum stride axis (absolute value), under the constraint
-    /// that the length of the axis is > 1;
+    /// Compute the minimum stride axis (absolute value), preferring axes with
+    /// length > 1.
     #[doc(hidden)]
     fn min_stride_axis(&self, strides: &Self) -> Axis {
         let n = match self.ndim() {
@@ -301,7 +301,7 @@ pub trait Dimension : Clone + Eq + Debug + Send + Sync + Default +
             n => n,
         };
         axes_of(self, strides)
-            .rev()
+            .filter(|ax| ax.len() > 1)
             .min_by_key(|ax| ax.stride().abs())
             .map_or(Axis(n - 1), |ax| ax.axis())
     }
@@ -588,9 +588,9 @@ impl Dimension for Dim<[Ix; 2]> {
 
     #[inline]
     fn min_stride_axis(&self, strides: &Self) -> Axis {
-        let s = get!(strides, 0) as Ixs;
-        let t = get!(strides, 1) as Ixs;
-        if s.abs() < t.abs() {
+        let s = (get!(strides, 0) as isize).abs();
+        let t = (get!(strides, 1) as isize).abs();
+        if s < t && get!(self, 0) > 1 {
             Axis(0)
         } else {
             Axis(1)

From b7951df25e396e3d8cfd9b1b674331314c937c45 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 18:34:13 -0500
Subject: [PATCH 36/41] Specialize min_stride_axis for Ix3

---
 src/dimension/dimension_trait.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/dimension/dimension_trait.rs b/src/dimension/dimension_trait.rs
index ba42ead07..038e5473c 100644
--- a/src/dimension/dimension_trait.rs
+++ b/src/dimension/dimension_trait.rs
@@ -697,6 +697,23 @@ impl Dimension for Dim<[Ix; 3]> {
         Some(Ix3(i, j, k))
     }
 
+    #[inline]
+    fn min_stride_axis(&self, strides: &Self) -> Axis {
+        let s = (get!(strides, 0) as isize).abs();
+        let t = (get!(strides, 1) as isize).abs();
+        let u = (get!(strides, 2) as isize).abs();
+        let (argmin, min) = if t < u && get!(self, 1) > 1 {
+            (Axis(1), t)
+        } else {
+            (Axis(2), u)
+        };
+        if s < min && get!(self, 0) > 1 {
+            Axis(0)
+        } else {
+            argmin
+        }
+    }
+
     /// Self is an index, return the stride offset
     #[inline]
     fn stride_offset(index: &Self, strides: &Self) -> isize {

From 3326de4b8c80aaa4cbccc9068ede3dcc92d6036d Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 18:09:09 -0500
Subject: [PATCH 37/41] Enable min_stride_axis as pub(crate) method

---
 src/impl_methods.rs | 46 +++++++++++++++++++++++++++++++++++++++++----
 tests/dimension.rs  | 31 ------------------------------
 2 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/src/impl_methods.rs b/src/impl_methods.rs
index de74af795..02e81736a 100644
--- a/src/impl_methods.rs
+++ b/src/impl_methods.rs
@@ -1619,12 +1619,11 @@ where
         axes_of(&self.dim, &self.strides)
     }
 
-    /*
-    /// Return the axis with the least stride (by absolute value)
-    pub fn min_stride_axis(&self) -> Axis {
+    /// Return the axis with the least stride (by absolute value),
+    /// preferring axes with len > 1.
+    pub(crate) fn min_stride_axis(&self) -> Axis {
         self.dim.min_stride_axis(&self.strides)
     }
-    */
 
     /// Return the axis with the greatest stride (by absolute value),
     /// preferring axes with len > 1.
@@ -2103,3 +2102,42 @@ where
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::prelude::*;
+
+    #[test]
+    fn min_stride_axis() {
+        let a = Array1::<u8>::zeros(10);
+        assert_eq!(a.min_stride_axis(), Axis(0));
+
+        let a = Array2::<u8>::zeros((3, 3));
+        assert_eq!(a.min_stride_axis(), Axis(1));
+        assert_eq!(a.t().min_stride_axis(), Axis(0));
+
+        let a = ArrayD::<u8>::zeros(vec![3, 3]);
+        assert_eq!(a.min_stride_axis(), Axis(1));
+        assert_eq!(a.t().min_stride_axis(), Axis(0));
+
+        let min_axis = a.axes().min_by_key(|t| t.2.abs()).unwrap().axis();
+        assert_eq!(min_axis, Axis(1));
+
+        let mut b = ArrayD::<u8>::zeros(vec![2, 3, 4, 5]);
+        assert_eq!(b.min_stride_axis(), Axis(3));
+        for ax in 0..3 {
+            b.swap_axes(3, ax);
+            assert_eq!(b.min_stride_axis(), Axis(ax));
+            b.swap_axes(3, ax);
+        }
+        let mut v = b.view();
+        v.collapse_axis(Axis(3), 0);
+        assert_eq!(v.min_stride_axis(), Axis(2));
+
+        let a = Array2::<u8>::zeros((3, 3));
+        let v = a.broadcast((8, 3, 3)).unwrap();
+        assert_eq!(v.min_stride_axis(), Axis(0));
+        let v2 = a.broadcast((1, 3, 3)).unwrap();
+        assert_eq!(v2.min_stride_axis(), Axis(2));
+    }
+}
diff --git a/tests/dimension.rs b/tests/dimension.rs
index c76b8d7ad..11d6b3e0f 100644
--- a/tests/dimension.rs
+++ b/tests/dimension.rs
@@ -132,37 +132,6 @@ fn fastest_varying_order() {
 
 type ArrayF32<D> = Array<f32, D>;
 
-/*
-#[test]
-fn min_stride_axis() {
-    let a = ArrayF32::zeros(10);
-    assert_eq!(a.min_stride_axis(), Axis(0));
-
-    let a = ArrayF32::zeros((3, 3));
-    assert_eq!(a.min_stride_axis(), Axis(1));
-    assert_eq!(a.t().min_stride_axis(), Axis(0));
-
-    let a = ArrayF32::zeros(vec![3, 3]);
-    assert_eq!(a.min_stride_axis(), Axis(1));
-    assert_eq!(a.t().min_stride_axis(), Axis(0));
-
-    let min_axis = a.axes().min_by_key(|t| t.2.abs()).unwrap().axis();
-    assert_eq!(min_axis, Axis(1));
-
-    let mut b = ArrayF32::zeros(vec![2, 3, 4, 5]);
-    assert_eq!(b.min_stride_axis(), Axis(3));
-    for ax in 0..3 {
-        b.swap_axes(3, ax);
-        assert_eq!(b.min_stride_axis(), Axis(ax));
-        b.swap_axes(3, ax);
-    }
-
-    let a = ArrayF32::zeros((3, 3));
-    let v = a.broadcast((8, 3, 3)).unwrap();
-    assert_eq!(v.min_stride_axis(), Axis(0));
-}
-*/
-
 #[test]
 fn max_stride_axis() {
     let a = ArrayF32::zeros(10);

From 65b6046b0d7e5e1060b81b04d564decde8dc5e66 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 18:20:12 -0500
Subject: [PATCH 38/41] Simplify fold to use min_stride_axis

This causes no change in performance according to the relevant
benchmarks in `bench1`.
---
 src/impl_methods.rs | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/src/impl_methods.rs b/src/impl_methods.rs
index 02e81736a..2e2c632e9 100644
--- a/src/impl_methods.rs
+++ b/src/impl_methods.rs
@@ -1853,25 +1853,11 @@ where
         } else {
             let mut v = self.view();
             // put the narrowest axis at the last position
-            match v.ndim() {
-                0 | 1 => {}
-                2 => {
-                    if self.len_of(Axis(1)) <= 1
-                        || self.len_of(Axis(0)) > 1
-                            && self.stride_of(Axis(0)).abs() < self.stride_of(Axis(1)).abs()
-                    {
-                        v.swap_axes(0, 1);
-                    }
-                }
-                n => {
-                    let last = n - 1;
-                    let narrow_axis = v
-                        .axes()
-                        .filter(|ax| ax.len() > 1)
-                        .min_by_key(|ax| ax.stride().abs())
-                        .map_or(last, |ax| ax.axis().index());
-                    v.swap_axes(last, narrow_axis);
-                }
+            let n = v.ndim();
+            if n > 1 {
+                let last = n - 1;
+                let narrow_axis = self.min_stride_axis();
+                v.swap_axes(last, narrow_axis.index());
             }
             v.into_elements_base().fold(init, f)
         }

From b0b391aa878a2839a8952fe1d5c6bb8fff6ced2d Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 18:31:30 -0500
Subject: [PATCH 39/41] Improve performance of sum in certain cases

This improves the performance of `sum` when an axis is contiguous but
the array as a whole is not contiguous.
---
 src/numeric/impl_numeric.rs | 13 ++++++++++---
 src/numeric_util.rs         |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index b8c393372..cf2c0eafd 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -32,10 +32,17 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Add<Output=A> + num_traits::Zero,
     {
         if let Some(slc) = self.as_slice_memory_order() {
-            numeric_util::pairwise_sum(&slc)
-        } else {
-            numeric_util::iterator_pairwise_sum(self.iter())
+            return numeric_util::pairwise_sum(&slc);
+        }
+        if self.ndim() > 1 {
+            let ax = self.dim.min_stride_axis(&self.strides);
+            if self.len_of(ax) >= numeric_util::UNROLL_SIZE && self.stride_of(ax) == 1 {
+                let partial_sums: Vec<_> =
+                    self.lanes(ax).into_iter().map(|lane| lane.sum()).collect();
+                return numeric_util::pure_pairwise_sum(&partial_sums);
+            }
         }
+        numeric_util::iterator_pairwise_sum(self.iter())
     }
 
     /// Return the sum of all elements in the array.
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 2557f2c68..855a1b803 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -79,7 +79,7 @@ where
 
 /// An implementation of pairwise summation for a vector slice that never
 /// switches to the naive sum algorithm.
-fn pure_pairwise_sum<A>(v: &[A]) -> A
+pub(crate) fn pure_pairwise_sum<A>(v: &[A]) -> A
     where
         A: Clone + Add<Output=A> + Zero,
 {

From 7f04e6fa56b945330278b3e9332b05611165d378 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 13:40:39 -0500
Subject: [PATCH 40/41] Update quickcheck and use quickcheck_macros

---
 Cargo.toml           |   3 +-
 src/dimension/mod.rs | 157 +++++++++++++++++++++----------------------
 src/lib.rs           |   2 +
 src/numeric_util.rs  |  10 ++-
 4 files changed, 86 insertions(+), 86 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bdab7df52..ce3b563c7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,7 +45,8 @@ serde = { version = "1.0", optional = true }
 
 [dev-dependencies]
 defmac = "0.2"
-quickcheck = { version = "0.7.2", default-features = false }
+quickcheck = { version = "0.8.1", default-features = false }
+quickcheck_macros = "0.8"
 rawpointer = "0.1"
 rand = "0.5.5"
 
diff --git a/src/dimension/mod.rs b/src/dimension/mod.rs
index aebcfc662..719acc678 100644
--- a/src/dimension/mod.rs
+++ b/src/dimension/mod.rs
@@ -629,7 +629,8 @@ mod test {
     use crate::error::{from_kind, ErrorKind};
     use crate::slice::Slice;
     use num_integer::gcd;
-    use quickcheck::{quickcheck, TestResult};
+    use quickcheck::TestResult;
+    use quickcheck_macros::quickcheck;
 
     #[test]
     fn slice_indexing_uncommon_strides() {
@@ -738,30 +739,29 @@ mod test {
         can_index_slice::<(), _>(&[], &Ix2(0, 2), &Ix2(2, 1)).unwrap_err();
     }
 
-    quickcheck! {
-        fn can_index_slice_not_custom_same_as_can_index_slice(data: Vec<u8>, dim: Vec<usize>) -> bool {
-            let dim = IxDyn(&dim);
-            let result = can_index_slice_not_custom(&data, &dim);
-            if dim.size_checked().is_none() {
-                // Avoid overflow `dim.default_strides()` or `dim.fortran_strides()`.
-                result.is_err()
-            } else {
-                result == can_index_slice(&data, &dim, &dim.default_strides()) &&
-                    result == can_index_slice(&data, &dim, &dim.fortran_strides())
-            }
+    #[quickcheck]
+    fn can_index_slice_not_custom_same_as_can_index_slice(data: Vec<u8>, dim: Vec<usize>) -> bool {
+        let dim = IxDyn(&dim);
+        let result = can_index_slice_not_custom(&data, &dim);
+        if dim.size_checked().is_none() {
+            // Avoid overflow `dim.default_strides()` or `dim.fortran_strides()`.
+            result.is_err()
+        } else {
+            result == can_index_slice(&data, &dim, &dim.default_strides()) &&
+                result == can_index_slice(&data, &dim, &dim.fortran_strides())
         }
     }
 
-    quickcheck! {
-        fn extended_gcd_solves_eq(a: isize, b: isize) -> bool {
-            let (g, (x, y)) = extended_gcd(a, b);
-            a * x + b * y == g
-        }
+    #[quickcheck]
+    fn extended_gcd_solves_eq(a: isize, b: isize) -> bool {
+        let (g, (x, y)) = extended_gcd(a, b);
+        a * x + b * y == g
+    }
 
-        fn extended_gcd_correct_gcd(a: isize, b: isize) -> bool {
-            let (g, _) = extended_gcd(a, b);
-            g == gcd(a, b)
-        }
+    #[quickcheck]
+    fn extended_gcd_correct_gcd(a: isize, b: isize) -> bool {
+        let (g, _) = extended_gcd(a, b);
+        g == gcd(a, b)
     }
 
     #[test]
@@ -773,73 +773,72 @@ mod test {
         assert_eq!(extended_gcd(-5, 0), (5, (-1, 0)));
     }
 
-    quickcheck! {
-        fn solve_linear_diophantine_eq_solution_existence(
-            a: isize, b: isize, c: isize
-        ) -> TestResult {
-            if a == 0 || b == 0 {
-                TestResult::discard()
-            } else {
-                TestResult::from_bool(
-                    (c % gcd(a, b) == 0) == solve_linear_diophantine_eq(a, b, c).is_some()
-                )
-            }
+    #[quickcheck]
+    fn solve_linear_diophantine_eq_solution_existence(
+        a: isize, b: isize, c: isize
+    ) -> TestResult {
+        if a == 0 || b == 0 {
+            TestResult::discard()
+        } else {
+            TestResult::from_bool(
+                (c % gcd(a, b) == 0) == solve_linear_diophantine_eq(a, b, c).is_some()
+            )
         }
+    }
 
-        fn solve_linear_diophantine_eq_correct_solution(
-            a: isize, b: isize, c: isize, t: isize
-        ) -> TestResult {
-            if a == 0 || b == 0 {
-                TestResult::discard()
-            } else {
-                match solve_linear_diophantine_eq(a, b, c) {
-                    Some((x0, xd)) => {
-                        let x = x0 + xd * t;
-                        let y = (c - a * x) / b;
-                        TestResult::from_bool(a * x + b * y == c)
-                    }
-                    None => TestResult::discard(),
+    #[quickcheck]
+    fn solve_linear_diophantine_eq_correct_solution(
+        a: isize, b: isize, c: isize, t: isize
+    ) -> TestResult {
+        if a == 0 || b == 0 {
+            TestResult::discard()
+        } else {
+            match solve_linear_diophantine_eq(a, b, c) {
+                Some((x0, xd)) => {
+                    let x = x0 + xd * t;
+                    let y = (c - a * x) / b;
+                    TestResult::from_bool(a * x + b * y == c)
                 }
+                None => TestResult::discard(),
             }
         }
     }
 
-    quickcheck! {
-        fn arith_seq_intersect_correct(
-            first1: isize, len1: isize, step1: isize,
-            first2: isize, len2: isize, step2: isize
-        ) -> TestResult {
-            use std::cmp;
+    #[quickcheck]
+    fn arith_seq_intersect_correct(
+        first1: isize, len1: isize, step1: isize,
+        first2: isize, len2: isize, step2: isize
+    ) -> TestResult {
+        use std::cmp;
 
-            if len1 == 0 || len2 == 0 {
-                // This case is impossible to reach in `arith_seq_intersect()`
-                // because the `min*` and `max*` arguments are inclusive.
-                return TestResult::discard();
-            }
-            let len1 = len1.abs();
-            let len2 = len2.abs();
-
-            // Convert to `min*` and `max*` arguments for `arith_seq_intersect()`.
-            let last1 = first1 + step1 * (len1 - 1);
-            let (min1, max1) = (cmp::min(first1, last1), cmp::max(first1, last1));
-            let last2 = first2 + step2 * (len2 - 1);
-            let (min2, max2) = (cmp::min(first2, last2), cmp::max(first2, last2));
-
-            // Naively determine if the sequences intersect.
-            let seq1: Vec<_> = (0..len1)
-                .map(|n| first1 + step1 * n)
-                .collect();
-            let intersects = (0..len2)
-                .map(|n| first2 + step2 * n)
-                .any(|elem2| seq1.contains(&elem2));
-
-            TestResult::from_bool(
-                arith_seq_intersect(
-                    (min1, max1, if step1 == 0 { 1 } else { step1 }),
-                    (min2, max2, if step2 == 0 { 1 } else { step2 })
-                ) == intersects
-            )
+        if len1 == 0 || len2 == 0 {
+            // This case is impossible to reach in `arith_seq_intersect()`
+            // because the `min*` and `max*` arguments are inclusive.
+            return TestResult::discard();
         }
+        let len1 = len1.abs();
+        let len2 = len2.abs();
+
+        // Convert to `min*` and `max*` arguments for `arith_seq_intersect()`.
+        let last1 = first1 + step1 * (len1 - 1);
+        let (min1, max1) = (cmp::min(first1, last1), cmp::max(first1, last1));
+        let last2 = first2 + step2 * (len2 - 1);
+        let (min2, max2) = (cmp::min(first2, last2), cmp::max(first2, last2));
+
+        // Naively determine if the sequences intersect.
+        let seq1: Vec<_> = (0..len1)
+            .map(|n| first1 + step1 * n)
+            .collect();
+        let intersects = (0..len2)
+            .map(|n| first2 + step2 * n)
+            .any(|elem2| seq1.contains(&elem2));
+
+        TestResult::from_bool(
+            arith_seq_intersect(
+                (min1, max1, if step1 == 0 { 1 } else { step1 }),
+                (min2, max2, if step2 == 0 { 1 } else { step2 })
+            ) == intersects
+        )
     }
 
     #[test]
diff --git a/src/lib.rs b/src/lib.rs
index 39f102e6f..39402bf56 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,6 +105,8 @@ extern crate num_integer;
 #[cfg(test)]
 extern crate quickcheck;
 #[cfg(test)]
+extern crate quickcheck_macros;
+#[cfg(test)]
 extern crate rand;
 
 #[cfg(feature = "docs")]
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 855a1b803..68be6fa14 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -215,14 +215,12 @@ pub fn unrolled_eq<A>(xs: &[A], ys: &[A]) -> bool
 
 #[cfg(test)]
 mod tests {
-    use quickcheck::quickcheck;
+    use quickcheck_macros::quickcheck;
     use std::num::Wrapping;
     use super::iterator_pairwise_sum;
 
-    quickcheck! {
-        fn iterator_pairwise_sum_is_correct(xs: Vec<i32>) -> bool {
-            let xs: Vec<_> = xs.into_iter().map(|x| Wrapping(x)).collect();
-            iterator_pairwise_sum(xs.iter()) == xs.iter().sum()
-        }
+    #[quickcheck]
+    fn iterator_pairwise_sum_is_correct(xs: Vec<Wrapping<i32>>) -> bool {
+        iterator_pairwise_sum(xs.iter()) == xs.iter().sum()
     }
 }

From 1ed1a638e7ec86c37de36a30c21bdd12f92940c5 Mon Sep 17 00:00:00 2001
From: Jim Turner <github@turner.link>
Date: Sun, 3 Feb 2019 19:07:03 -0500
Subject: [PATCH 41/41] Clarify capacity calculation in iterator_pairwise_sum

---
 src/numeric_util.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index 68be6fa14..f6540fb92 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -62,7 +62,7 @@ where
     A: Clone + Add<Output=A> + Zero,
 {
     let (len, _) = iter.size_hint();
-    let cap = len.saturating_sub(1) / NAIVE_SUM_THRESHOLD + 1; // ceiling of division
+    let cap = len / NAIVE_SUM_THRESHOLD + if len % NAIVE_SUM_THRESHOLD != 0 { 1 } else { 0 };
     let mut partial_sums = Vec::with_capacity(cap);
     let (_, last_sum) = iter.fold((0, A::zero()), |(count, partial_sum), x| {
         if count < NAIVE_SUM_THRESHOLD {