From f232ad401113ac788054003c829a7a98a666fc7b Mon Sep 17 00:00:00 2001
From: Jonas Glesaaen <jonas@glesaaen.com>
Date: Mon, 14 May 2018 14:00:39 +0100
Subject: [PATCH 1/5] Moved all std library includes out of namespaces

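Including standard library headers inside a namespace violates the C++
standard (see [using.headers], item 20.5.2.2.3), which requires headers
to be included outside of any declaration or definition. Doing so can,
and sometimes will, lead to compilation errors.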
---
 include/qdp_sse_intrin.h                             |  3 ++-
 include/scalarsite_sse/sse_blas_local_sumsq_double.h |  4 +---
 include/scalarsite_sse/sse_dcomplex_mult_macros.h    |  5 ++++-
 include/scalarsite_sse/sse_spin_proj_inlines.h       |  2 +-
 lib/scalarsite_sse/sse_blas_local_sumsq_double.cc    |  4 +---
 lib/scalarsite_sse/sse_blas_local_vcdot_double.cc    |  8 +++++---
 lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc       |  5 ++---
 lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc        |  4 +---
 lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc       |  5 ++---
 lib/scalarsite_sse/sse_blas_vaxpy4_double.cc         |  5 ++---
 lib/scalarsite_sse/sse_blas_vaypx4_double.cc         |  2 +-
 lib/scalarsite_sse/sse_blas_vscal4_double.cc         |  3 +--
 lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc      | 10 ++++++----
 lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc      |  9 ++++++---
 lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc      |  9 ++++++---
 lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc      |  9 ++++++---
 lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc  |  2 +-
 lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc      |  2 +-
 18 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/include/qdp_sse_intrin.h b/include/qdp_sse_intrin.h
index 48f5ee5b9..9264fff4f 100644
--- a/include/qdp_sse_intrin.h
+++ b/include/qdp_sse_intrin.h
@@ -1,9 +1,10 @@
 #ifndef QDP_SSE_INTRIN_H
 #define QDP_SSE_INTRIN_H
 
+#include <xmmintrin.h>
+
 // Include the file with the SSE intrinsics  in it
 namespace QDP {
-#include <xmmintrin.h>
 typedef __m128 v4sf;
 
 typedef union { 
diff --git a/include/scalarsite_sse/sse_blas_local_sumsq_double.h b/include/scalarsite_sse/sse_blas_local_sumsq_double.h
index 8024d636a..e44affc0a 100644
--- a/include/scalarsite_sse/sse_blas_local_sumsq_double.h
+++ b/include/scalarsite_sse/sse_blas_local_sumsq_double.h
@@ -9,14 +9,12 @@
 #define QDP_SSE_BLAS_LOCAL_SUMSQ_DOUBLE
 
 #include "qdp_precision.h"
+#include <xmmintrin.h>
 
 namespace QDP {
 
-#include <xmmintrin.h>
-
   void local_sumsq4(REAL64 *sum, REAL64 *vecptr, int n_4spin);
 
-
 } // namespace QDP;
 
 #endif // guard
diff --git a/include/scalarsite_sse/sse_dcomplex_mult_macros.h b/include/scalarsite_sse/sse_dcomplex_mult_macros.h
index 0cea2706c..208273269 100644
--- a/include/scalarsite_sse/sse_dcomplex_mult_macros.h
+++ b/include/scalarsite_sse/sse_dcomplex_mult_macros.h
@@ -7,6 +7,10 @@
 /* SSE 2 Headers */
 #include<xmmintrin.h>
 
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
+
 /* A useful union type allows me to set values into the 
    vector  from code */
 
@@ -122,7 +126,6 @@
 #else
 #warning "Using SSE3"
 /* SSE 3 */
-#include <pmmintrin.h>
 
 /* z = x*y    z, x, y are SSE registers containing complex numbers
               ordered with the real part in the low half, imag part 
diff --git a/include/scalarsite_sse/sse_spin_proj_inlines.h b/include/scalarsite_sse/sse_spin_proj_inlines.h
index b76fc82e4..897ada32a 100644
--- a/include/scalarsite_sse/sse_spin_proj_inlines.h
+++ b/include/scalarsite_sse/sse_spin_proj_inlines.h
@@ -2,13 +2,13 @@
 #define SSE_SPIN_PROJ_INLINES_H
 
 #include "qdp_sse_intrin.h"
+#include <stdio.h>
 
 /* File: generic_spin_proj_inlines.h
    Purpose: Supply inline functions to do spin projection
    Author: $Id: sse_spin_proj_inlines.h,v 1.6 2009-02-11 20:50:45 bjoo Exp $
 */
 namespace QDP {
-#include <stdio.h>
 
 /** \brief Spin Project (1/2)(1+\gamma_0)
  *
diff --git a/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc b/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc
index 2c611d700..8d73e3a6d 100644
--- a/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc
+++ b/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc
@@ -4,13 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_local_sumsq_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
-
-
 // (Vector) out = (Scalar) (*scalep) * (Vector) InScale + (Vector) Add
 // #define DEBUG_VAXPY_DOUBLE
   void local_sumsq4(REAL64 *sum, REAL64 *vecptr, int n_4spin)
diff --git a/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc b/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc
index a2a77d2f8..22fba61a9 100644
--- a/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc
+++ b/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc
@@ -6,10 +6,13 @@
 
 #include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_local_vcdot_double.h"
+#include "qdp_config.h"
 
-namespace QDP {
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
 
-#include "qdp_config.h"
+namespace QDP {
 
 #ifndef QDP_USE_SSE3
 
@@ -45,7 +48,6 @@ namespace QDP {
 #else
 #warning Using SSE3
   /* SSE 3 */
-#include <pmmintrin.h>
 
 #define CONJMUL(z,x,y)		\
   { \
diff --git a/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc
index 6b4eabdce..80cec5051 100644
--- a/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc
@@ -4,13 +4,12 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vaxmbyz4_double.h"
+#include "scalarsite_sse/sse_prefetch.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
-#include "scalarsite_sse/sse_prefetch.h"
-
 #ifndef L2BY2
 #define L2BY2 1365          /* L2 / 2 in SPINORS */
 #endif
diff --git a/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc
index 7b69d3666..b43708a13 100644
--- a/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc
@@ -4,13 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vaxmyz4_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
-
-
 void vaxmyz4(REAL64 *Out,REAL64 *scalep,REAL64 *InScale, REAL64 *Add,int n_4vec)
 {
  __m128d scalar;
diff --git a/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc
index e48059d3d..81af4113e 100644
--- a/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc
@@ -4,13 +4,12 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vaxpbyz4_double.h"
+#include "scalarsite_sse/sse_prefetch.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
-#include "scalarsite_sse/sse_prefetch.h"
-
 
 #ifndef L2BY2
 #define L2BY2 1365          /* L2 / 2 in SPINORS */
diff --git a/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc b/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc
index bf8342e5b..03c365b01 100644
--- a/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc
@@ -4,14 +4,13 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vaxpy4_double.h"
+#include "scalarsite_sse/sse_prefetch.h"
 
 namespace QDP {
 
 
-#include <xmmintrin.h>
-#include "scalarsite_sse/sse_prefetch.h"
-
 #ifndef L2BY2
 #define L2BY2 1365          /* L2 / 2 in SPINORS */
 #endif
diff --git a/lib/scalarsite_sse/sse_blas_vaypx4_double.cc b/lib/scalarsite_sse/sse_blas_vaypx4_double.cc
index da8e14b28..dc4db5f5f 100644
--- a/lib/scalarsite_sse/sse_blas_vaypx4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vaypx4_double.cc
@@ -4,11 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vaypx4_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
 
 void vaypx4(REAL64 *Out,REAL64 *scalep,REAL64 *InScale, int n_4spin)
 {
diff --git a/lib/scalarsite_sse/sse_blas_vscal4_double.cc b/lib/scalarsite_sse/sse_blas_vscal4_double.cc
index a2399b87f..77fa018ac 100644
--- a/lib/scalarsite_sse/sse_blas_vscal4_double.cc
+++ b/lib/scalarsite_sse/sse_blas_vscal4_double.cc
@@ -4,12 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_blas_vscal4_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
-
 
 void vscal4(REAL64 *z,REAL64 *a,REAL64 *x, int n_4spin)
 {
diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc
index 0139147dd..998a3b8c9 100644
--- a/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc
@@ -4,18 +4,21 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
+#include "qdp_config.h"
 
-namespace QDP {
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
 
-#include <xmmintrin.h>
+namespace QDP {
 
 typedef union {
   __m128d v;
   double  d[2];
 } VD;
 
-#include "qdp_config.h"
 
 #ifndef QDP_USE_SSE3
 
@@ -54,7 +57,6 @@ typedef union {
 #else
 #warning Using SSE3
   /* SSE 3 */
-#include <pmmintrin.h>
 #define CCMUL(z,x,y)		\
   { \
     __m128d t1; \
diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc
index 7f040aa0f..9307eb2bc 100644
--- a/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc
@@ -4,12 +4,16 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
+#include "qdp_config.h"
+
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
 
 namespace QDP {
 
-#include <xmmintrin.h>
-#include "qdp_config.h"
 
 #ifndef QDP_USE_SSE3
 
@@ -45,7 +49,6 @@ namespace QDP {
 #else
 #warning Using SSE3
   /* SSE 3 */
-#include <pmmintrin.h>
 
 #define CONJMUL(z,x,y)		\
   { \
diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc
index a253f5be5..da845c7a8 100644
--- a/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc
@@ -4,12 +4,16 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
+#include "qdp_config.h"
+
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
 
 namespace QDP {
 
-#include <xmmintrin.h>
-#include "qdp_config.h"
 
 #ifndef QDP_USE_SSE3
 
@@ -45,7 +49,6 @@ namespace QDP {
 #else
 #warning Using SSE3
   /* SSE 3 */
-#include <pmmintrin.h>
 
 #define CONJMUL(z,x,y)		\
   { \
diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc
index 6b0023ff8..f6a3d1b74 100644
--- a/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc
@@ -4,13 +4,17 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
+#include "qdp_config.h"
+
+#ifdef QDP_USE_SSE3
+#include <pmmintrin.h>
+#endif
 
 namespace QDP {
 
-#include <xmmintrin.h>
 
-#include "qdp_config.h"
 #ifndef QDP_USE_SSE3
 
   // c = x*y;
@@ -46,7 +50,6 @@ namespace QDP {
 
 #else 
 #warning Using SSE3
-#include <pmmintrin.h>
   // Use SSE3
 
 #define CMUL(z,x,y)		\
diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc
index 10efd2f96..5cfcfed20 100644
--- a/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc
@@ -4,11 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
 
   /* M = a*M  a is scalar */
   void ssed_m_eq_scal_m(REAL64* m2, REAL64* a, REAL64 *m1, int n_mat)
diff --git a/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc b/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc
index bbac06b74..eca330c0b 100644
--- a/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc
+++ b/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc
@@ -4,11 +4,11 @@
  *
  */
 
+#include <xmmintrin.h>
 #include "scalarsite_sse/sse_linalg_mm_su3_double.h"
 
 namespace QDP {
 
-#include <xmmintrin.h>
 
   typedef union { 
     double c[2];

From e8c5017b2f081c8a26773022a2a0391e5473970d Mon Sep 17 00:00:00 2001
From: Jonas Glesaaen <jonas@glesaaen.com>
Date: Mon, 14 May 2018 14:06:57 +0100
Subject: [PATCH 2/5] Moved all includes outside of namespaces in include root

One should in general not have includes inside of namespaces unless
strictly necessary (which it isn't in this case).
---
 include/qdp_config.h             | 4 ++++
 include/qdp_params.h             | 5 ++---
 include/qdp_parscalar_specific.h | 4 +++-
 include/qdp_precision.h          | 4 ++++
 include/qdp_scalar_specific.h    | 4 ++++
 include/qdp_scalarsite_defs.h    | 6 +++---
 include/qdp_scalarvecsite_defs.h | 6 +++---
 7 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/qdp_config.h b/include/qdp_config.h
index 23ccae6a5..64f412511 100644
--- a/include/qdp_config.h
+++ b/include/qdp_config.h
@@ -14,6 +14,8 @@
 /* Include the stuff generated by autoconf */
 #include "qdp_config_internal.h"
 
+namespace QDP {
+
 /* Prefix everything with QDP_ */
 static const char* const QDP_PACKAGE(PACKAGE);
 static const char* const QDP_PACKAGE_BUGREPORT(PACKAGE_BUGREPORT);
@@ -23,6 +25,8 @@ static const char* const QDP_PACKAGE_TARNAME(PACKAGE_TARNAME);
 static const char* const QDP_PACKAGE_VERSION(PACKAGE_VERSION);
 static const char* const QDP_VERSION(VERSION);
 
+} // namespace QDP 
+
 
 /* Undef the unwanted */
 #undef PACKAGE
diff --git a/include/qdp_params.h b/include/qdp_params.h
index d0b8042e3..c81e43bf6 100644
--- a/include/qdp_params.h
+++ b/include/qdp_params.h
@@ -7,8 +7,9 @@
 #ifndef QDP_PARAMS_H
 #define QDP_PARAMS_H
 
-namespace QDP {
+#include <qdp_config.h>
 
+namespace QDP {
 
 /*! @defgroup params Fundamental parameters for QDP
  *
@@ -19,8 +20,6 @@ namespace QDP {
  * @{
  */
 
-#include <qdp_config.h>
-
 const int Nd = QDP_ND;
 const int Nc = QDP_NC;
 const int Ns = QDP_NS;
diff --git a/include/qdp_parscalar_specific.h b/include/qdp_parscalar_specific.h
index e5842f124..b345a7019 100644
--- a/include/qdp_parscalar_specific.h
+++ b/include/qdp_parscalar_specific.h
@@ -323,10 +323,12 @@ void evaluate_userfunc(int lo, int hi, int myId, user_arg<T,T1,Op,RHS> *a)
 	 }
 }
 
+} // namespace QDP
+
 //! include the header file for dispatch
 #include "qdp_dispatch.h"
 
-
+namespace QDP {
 
 //-----------------------------------------------------------------------------
 //! OLattice Op Scalar(Expression(source)) under an Subset
diff --git a/include/qdp_precision.h b/include/qdp_precision.h
index d85fa0cb8..01ec77053 100644
--- a/include/qdp_precision.h
+++ b/include/qdp_precision.h
@@ -11,6 +11,8 @@
 // Fix Definitions
 #include <qdp_config.h>
 
+namespace QDP {
+
 // Fix default precision
 #if ! defined(BASE_PRECISION)
 #define BASE_PRECISION 32
@@ -35,6 +37,8 @@ typedef REAL64    DOUBLE;
 typedef REAL64    REAL;
 typedef REAL64    DOUBLE;
 
+} // namespace QDP 
+
 #define INNER_LOG 1
 
 #else
diff --git a/include/qdp_scalar_specific.h b/include/qdp_scalar_specific.h
index 372a90308..6568b58fd 100644
--- a/include/qdp_scalar_specific.h
+++ b/include/qdp_scalar_specific.h
@@ -139,9 +139,13 @@ void evaluate_userfunc(int lo, int hi, int myId, user_arg<T,T1,Op,RHS> *a)
    }
 }
 
+} // namespace QDP 
+
 //! include the header file for dispatch
 #include "qdp_dispatch.h"
 
+namespace QDP {
+
 //-----------------------------------------------------------------------------
 //! OLattice Op Scalar(Expression(source)) under an Subset
 /*! 
diff --git a/include/qdp_scalarsite_defs.h b/include/qdp_scalarsite_defs.h
index 4c368d05e..0a3ace630 100644
--- a/include/qdp_scalarsite_defs.h
+++ b/include/qdp_scalarsite_defs.h
@@ -7,6 +7,9 @@
 #ifndef QDP_SCALARSITE_DEFS_H
 #define QDP_SCALARSITE_DEFS_H
 
+#include <qdp_config.h>
+#include "qdp_precision.h"
+
 namespace QDP {
 
 /*! \addtogroup defs Type definitions
@@ -18,9 +21,6 @@ namespace QDP {
  * @{
  */
 
-#include <qdp_config.h>
-#include "qdp_precision.h"
-
 //----------------------------------------------------------------------
 //! Gamma matrices are conveniently defined for this Ns
 typedef GammaType<Ns> Gamma;
diff --git a/include/qdp_scalarvecsite_defs.h b/include/qdp_scalarvecsite_defs.h
index 2f6b7e40c..460dc0c8a 100644
--- a/include/qdp_scalarvecsite_defs.h
+++ b/include/qdp_scalarvecsite_defs.h
@@ -7,6 +7,9 @@
 #ifndef QDP_SCALARVECSITE_DEFS_H
 #define QDP_SCALARVECSITE_DEFS_H
 
+#include <qdp_config.h>
+#include "qdp_precision.h"
+
 namespace QDP {
 
 
@@ -19,9 +22,6 @@ namespace QDP {
  * @{
  */
 
-#include <qdp_config.h>
-#include "qdp_precision.h"
-
 //----------------------------------------------------------------------
 //! Gamma matrices are conveniently defined for this Ns
 typedef GammaType<Ns> Gamma;

From 408c9b43f2e99a4285524e0c722fdd787aac89d2 Mon Sep 17 00:00:00 2001
From: Jonas Glesaaen <jonas@glesaaen.com>
Date: Mon, 14 May 2018 14:08:40 +0100
Subject: [PATCH 3/5] Moved includes outside of namespaces in
 include/scalarsite_generic

---
 include/qdp_scalarsite_pabasm.h               | 19 +++----
 .../qdp_generic_fused_spin_proj_evaluates.h   | 13 ++---
 ...eneric_fused_spin_proj_evaluates_wrapper.h | 13 +++--
 .../qdp_generic_fused_spin_recon_evaluates.h  | 11 +---
 ...neric_fused_spin_recon_evaluates_wrapper.h | 15 ++---
 .../qdp_generic_spin_project_evaluates.h      | 11 ++--
 ...p_generic_spin_project_evaluates_wrapper.h |  6 +-
 .../qdp_scalarsite_generic_blas.h             | 10 +++-
 .../qdp_scalarsite_generic_blas_g5.h          | 12 ++--
 .../qdp_scalarsite_generic_blas_g5_wrapper.h  | 57 +++++++++----------
 .../qdp_scalarsite_generic_blas_wrapper.h     | 20 +++----
 .../qdp_scalarsite_generic_cblas.h            | 10 +++-
 .../qdp_scalarsite_generic_cblas_wrapper.h    | 15 +++--
 .../qdp_scalarsite_generic_linalg.h           | 39 ++++++-------
 .../qdp_scalarsite_generic_linalg_wrapper.h   | 12 ++--
 15 files changed, 136 insertions(+), 127 deletions(-)

diff --git a/include/qdp_scalarsite_pabasm.h b/include/qdp_scalarsite_pabasm.h
index 84143a32b..2cd9f4adf 100644
--- a/include/qdp_scalarsite_pabasm.h
+++ b/include/qdp_scalarsite_pabasm.h
@@ -11,6 +11,15 @@
 
 #warning "Using PABASM Scalarsite"
 
+#include "scalarsite_generic/generic_mult_nn.h"
+#include "scalarsite_generic/generic_mult_na.h"
+#include "scalarsite_generic/generic_mult_an.h"
+#include "scalarsite_generic/generic_mult_aa.h"
+#include "scalarsite_generic/generic_mat_vec.h"
+#include "scalarsite_generic/generic_adj_mat_vec.h"
+#include "scalarsite_generic/generic_addvec.h"
+
+
 namespace QDP {
 
 /*! @defgroup optimizations  Optimizations
@@ -23,16 +32,6 @@ namespace QDP {
 // Use this def just to safe some typing later on in the file
 #define RComplexFloat  RComplex<float>
 
-
-#include "scalarsite_generic/generic_mult_nn.h"
-#include "scalarsite_generic/generic_mult_na.h"
-#include "scalarsite_generic/generic_mult_an.h"
-#include "scalarsite_generic/generic_mult_aa.h"
-#include "scalarsite_generic/generic_mat_vec.h"
-#include "scalarsite_generic/generic_adj_mat_vec.h"
-#include "scalarsite_generic/generic_addvec.h"
-
-
 // #define QDP_SCALARSITE_DEBUG
 
 // Optimized version of  
diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h
index 04de0f05f..4c0dc785d 100644
--- a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h
+++ b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h
@@ -1,14 +1,15 @@
 #ifndef QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_H
 #define QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_H
 
-
-/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */
-using namespace QDP;
 namespace QDP {
-
 typedef PScalar< PColorMatrix< RComplex<REAL>, 3> > SU3Mat;
+} // namespace QDP;
 
+// the wrappers for the functions to be threaded
+#include "qdp_generic_fused_spin_proj_evaluates_wrapper.h"
 
+/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */
+namespace QDP {
 
 ////////////////////////////////
 // Threading evaluates
@@ -16,10 +17,6 @@ typedef PScalar< PColorMatrix< RComplex<REAL>, 3> > SU3Mat;
 // by Xu Guo, EPCC, 28 August, 2008
 ////////////////////////////////
 
-// ther wrappers for the functions to be threaded
-#include "qdp_generic_fused_spin_proj_evaluates_wrapper.h"
-
-
 // HalfVec = adj(u)*SpinProjectDir0Plus(Vec);
 template<>
 inline
diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h
index d128a2ee9..3d4a9ff95 100644
--- a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h
+++ b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H
 #define QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -33,7 +35,7 @@ void ordered_fused_spin_proj_evaluate_function (int lo, int hi, int myId, ordere
   for (int site = low; site < high; site++){
      HVec tmp;
      func( (REAL *)&(a.elem(site).elem(0).elem(0).real()),(REAL *)&(tmp.elem(0).elem(0).real()), 1);
-      
+
      _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(0), d.elem(site).elem(0));
      _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(1), d.elem(site).elem(1));   
   }
@@ -63,18 +65,17 @@ void unordered_fused_spin_proj_evaluate_function (int lo, int hi, int myId, unor
 
   for (int j = lo; j < hi; j++){
     int site = tab[j];
-      
+
     HVec tmp;
     func( (REAL *)&(a.elem(site).elem(0).elem(0).real()), (REAL *)&(tmp.elem(0).elem(0).real()), 1);
-      
-      
+
+
     _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(0), d.elem(site).elem(0));
     _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(1), d.elem(site).elem(1));   
   }
 
 }
 
-
-
+} // namespace QDP;
 
 #endif
diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h
index 3859c696c..e333123bf 100644
--- a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h
+++ b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h
@@ -1,8 +1,10 @@
 #ifndef QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_H
 #define QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_H
 
-namespace QDP {
+// the wrappers for the functions to be threaded
+#include "qdp_generic_fused_spin_recon_evaluates_wrapper.h"
 
+namespace QDP {
 
 ////////////////////////////////
 // Threading evaluates
@@ -10,9 +12,6 @@ namespace QDP {
 // by Xu Guo, EPCC, 28 August, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_generic_fused_spin_recon_evaluates_wrapper.h"
-
 
 // Vec = SpinReconstructDir0Plus( u * psi);
 template<>
@@ -1210,8 +1209,4 @@ void evaluate(OLattice< FVec >& d,
 
 } // namespace QDP;
 
-
-
-
-
 #endif
diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h
index 241d48b6c..cda7d8fdf 100644
--- a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h
+++ b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H
 #define QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -34,8 +36,8 @@ void ordered_fused_spin_recon_evaluate_function (int lo, int hi, int myId, order
       HVec tmp;
       _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(0), tmp.elem(0));
       _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(1), tmp.elem(1));
-      
-      
+
+
       func( (REAL *)&(tmp.elem(0).elem(0).real()), (REAL *)&(d.elem(site).elem(0).elem(0).real()), 1);
   }
 
@@ -65,19 +67,18 @@ void unordered_fused_spin_recon_evaluate_function (int lo, int hi, int myId, uno
   for (int j = lo; j < hi; j++){
 
     int site = tab[j];
-      
+
     HVec tmp;
     _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(0), tmp.elem(0));
     _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(1), tmp.elem(1));
-      
-      
+
+
     func( (REAL *)&(tmp.elem(0).elem(0).real()), (REAL *)&(d.elem(site).elem(0).elem(0).real()),1);
   }
 
 }
 
-
-
+} // namespace QDP;
 
 
 #endif
diff --git a/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h b/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h
index 1904522e5..da6b3a89d 100644
--- a/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h
+++ b/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h
@@ -1,7 +1,6 @@
 #ifndef QDP_GENERIC_SPIN_PROJECT_EVALUTATES_H
 #define QDP_GENERIC_SPIN_PROJECT_EVALUTATES_H
 
-using namespace QDP;
 namespace QDP {
 
 // Typedefs
@@ -15,15 +14,19 @@ typedef REAL SpinColFull[4][3][2];
 typedef REAL SpinColHalf[2][3][2];
 // d = SpinProjectDir0Plus(Vec);
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+#include "qdp_generic_spin_project_evaluates_wrapper.h"
+
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates
 //
 // by Xu Guo, EPCC, 28 August, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_generic_spin_project_evaluates_wrapper.h"
-
 
 template<class A, class B>
 inline
diff --git a/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h
index 73c9b3335..10d4b0ef7 100644
--- a/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h
+++ b/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h
@@ -1,6 +1,7 @@
 #ifndef QDP_GENERIC_SPIN_PROJECT_EVALUATES_WRAPPER_H
 #define QDP_GENERIC_SPIN_PROJECT_EVALUATES_WRAPPER_H
 
+namespace QDP {
 
 ////////////////////////////////
 // Threading evaluates wrappers
@@ -64,9 +65,6 @@ void unordered_spin_project_evaluate_function (int lo, int hi, int myId, unorder
 
 }
 
-
-
-
-
+} // namespace QDP 
 
 #endif
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas.h
index bbb99cbe9..26fb3035e 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_blas.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas.h
@@ -28,15 +28,19 @@ namespace QDP {
 typedef PSpinVector<PColorVector<RComplex<REAL>, 3>, 4> TVec;
 typedef PScalar<PScalar<RScalar<REAL> > >  TScal;
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+#include "qdp_scalarsite_generic_blas_wrapper.h"
+
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates
 //
 // by Xu Guo, EPCC, 12 August, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_scalarsite_generic_blas_wrapper.h"
-
 
 // #define DEBUG_BLAS
 // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h
index 7e5389e8f..9d43ce18c 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h
@@ -17,8 +17,6 @@
 #include "scalarsite_generic/generic_blas_vaxpby3_g5.h"
 #include "scalarsite_generic/generic_blas_g5.h"
 
-using namespace QDP;
-
 namespace QDP {
 
 // Types needed for the expression templates. 
@@ -26,15 +24,19 @@ namespace QDP {
 typedef PSpinVector<PColorVector<RComplex<REAL>, 3>, Ns> TVec;
 typedef PScalar<PScalar<RScalar<REAL> > >  TScal;
 
+} // namespace QDP 
+
+// the wrappers for the functions to be threaded
+#include "qdp_scalarsite_generic_blas_g5_wrapper.h"
+
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates
 //
 // by Xu Guo, EPCC, 26 August, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_scalarsite_generic_blas_g5_wrapper.h"
-
 
 // #define DEBUG_BLAS_G6
 // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h
index fd2e452fe..db677db2c 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SCALARSITE_GENERIC_BLAS_G5_WRAPPER_H
 #define QDP_SCALARSITE_GENERIC_BLAS_G5_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -70,7 +72,7 @@ void unordered_vaypx3_g5_y_evaluate_function (int lo, int hi, int myId, unordere
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real());
@@ -78,7 +80,7 @@ void unordered_vaypx3_g5_y_evaluate_function (int lo, int hi, int myId, unordere
     func(yptr, aptr, yptr, xptr, Ns);
   }
 }
- 
+
 
 // structure for vaypx3_g5 of NOT having order (with z )
 struct unordered_vaypx3_g5_z_user_arg{
@@ -90,7 +92,7 @@ struct unordered_vaypx3_g5_z_user_arg{
 				 int Ns_,
 				 const int* tab_,
 				 void (*func_)(REAL*, REAL*, REAL*, REAL*, int)) : x(x_),y(y_),d(d_),aptr(aptr_), Ns(Ns_), tab(tab_),func(func_) {}
-				 
+
   const OLattice< TVec >& x;
   const OLattice< TVec >& y;
   OLattice< TVec >& d;
@@ -111,14 +113,14 @@ void unordered_vaypx3_g5_z_evaluate_function (int lo, int hi, int myId, unordere
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
 
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1
     func(zptr, aptr, xptr, yptr, Ns);
   }
@@ -181,7 +183,7 @@ void unordered_vadd3_g5_evaluate_function (int lo, int hi, int myId, unordered_v
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real());
@@ -236,7 +238,7 @@ struct unordered_vaxpy3_g5_user_arg{
 			       int Ns_,
 			       const int* tab_,
 			       void (*func_)(REAL*, REAL*, REAL*, REAL*, int)) : x(x_), y(y_),d(d_),aptr(aptr_),Ns(Ns_), tab(tab_), func(func_) {}
-  
+
   const OLattice< TVec >& x;
   const OLattice< TVec >& y;
   OLattice< TVec >& d;
@@ -257,13 +259,13 @@ void unordered_vaxpy3_g5_evaluate_function (int lo, int hi, int myId, unordered_
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, yptr, Ns);
   }
 }
@@ -297,7 +299,7 @@ void ordered_vscal_g5_evaluate_function (int lo, int hi, int myId, ordered_vscal
   int index = lo * 24;
   Out = &Out[index];
   In = &In[index];
- 
+
   func(Out, scalep, In, n_4vec);
 
 }
@@ -330,12 +332,12 @@ void unordered_vscal_g5_evaluate_function (int lo, int hi, int myId, unordered_v
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, Ns);
   }
 }
@@ -414,13 +416,13 @@ void unordered_vaxpby3_g5_evaluate_function (int lo, int hi, int myId, unordered
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, bptr, yptr, Ns);
   }
 }
@@ -453,7 +455,7 @@ void ordered_scal_g5_evaluate_function (int lo, int hi, int myId, ordered_scal_g
   int index = lo * 24;
   Out = &Out[index];
   In = &In[index];
- 
+
   scal_g5(Out, scalep, In, n_4vec);
 
 }
@@ -482,12 +484,12 @@ void unordered_scal_g5_evaluate_function (int lo, int hi, int myId, unordered_sc
   REAL* aptr = a->aptr;
   int Ns = a->Ns;
   const int* tab = a->tab;
- 
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     scal_g5(zptr, aptr, xptr, Ns);
   }
 }
@@ -562,13 +564,13 @@ void unordered_xOpayz_g5_evaluate_function (int lo, int hi, int myId, unordered_
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, yptr, Ns);
   }
 }
@@ -645,13 +647,13 @@ void unordered_axOpbyz_g5_evaluate_function (int lo, int hi, int myId, unordered
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, bptr, yptr, Ns);
   }
 }
@@ -724,13 +726,13 @@ void unordered_xOpayz_ig5_y_evaluate_function (int lo, int hi, int myId, unorder
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, xptr, yptr, Ns);
   }
 }
@@ -765,19 +767,16 @@ void unordered_xOpayz_ig5_z_evaluate_function (int lo, int hi, int myId, unorder
   int Ns = a->Ns;
   const int* tab = a->tab;
   void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func;
-  
+
   for(int j=lo; j < hi; j++) { 
     int i=tab[j];
     REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
     REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
     func(zptr, aptr, zptr, xptr, Ns);
   }
 }
 
-
-
-
-
+} // namespace QDP;
 
 #endif
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h
index acfa37c46..6aecf2dd1 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SCALARSITE_GENERIC_BLAS_WRAPPER_H
 #define QDP_SCALARSITE_GENERIC_BLAS_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -79,7 +81,7 @@ void unordered_vaxpy3_y_evaluate_function (int lo, int hi, int myId, unordered_v
      vaxpy3(yptr, scalep, yptr, xptr, 1);
    }
   }
- 
+
 
 }
 
@@ -246,7 +248,7 @@ void ordered_vscal_evaluate_function (int lo, int hi, int myId, ordered_vscal_us
 // structure for vscal of NOT having order
 struct unordered_vscal_user_arg {
   unordered_vscal_user_arg(
-  
+
 			   const OLattice< TVec >& x_,
 			   OLattice< TVec >& d_,
 			   REAL* scalep_,
@@ -341,10 +343,10 @@ void unordered_vaxpby3_evaluate_function (int lo, int hi, int myId, unordered_va
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
      // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1
      vaxpby3(zptr, aptr, xptr, bptr, yptr, 1);
-  
+
    }
 
 }
@@ -415,18 +417,14 @@ void unordered_vaxmby3_evaluate_function (int lo, int hi, int myId, unordered_va
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
      // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1
      vaxmby3(zptr, aptr, xptr, bptr, yptr,1);
-  
+
    }
 
 }
 
-
-
-
-
-
+} // namespace QDP 
 
 #endif
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h b/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h
index 9b659af8c..66b7ec4b9 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h
@@ -14,15 +14,19 @@ namespace QDP {
 typedef PScalar<PScalar<RComplex<REAL> > >  CScal;
 typedef PSpinVector<PColorVector<RComplex<REAL>, 3>, 4> CTVec;
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+#include "qdp_scalarsite_generic_cblas_wrapper.h"
+
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates
 //
 // by Xu Guo, EPCC, 26 August, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_scalarsite_generic_cblas_wrapper.h"
-
 // vector z *= complex a
 template<>
 inline
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h
index 07aec4ab6..aebe89702 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SCALARSITE_GENERIC_CBLAS_WRAPPER_H
 #define QDP_SCALARSITE_GENERIC_CBLAS_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -120,12 +122,12 @@ void unordered_vcaxpy3_y_evaluate_function (int lo, int hi, int myId, unordered_
 
   for(int j=lo; j < hi; j++) { 
      int i=tab[j];
-   
+
      REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real());
      REAL* yptr = &(d.elem(i).elem(0).elem(0).real());
 
      vcaxpy3(yptr, scalep, xptr, yptr, 1);
-    
+
    }
 
 }
@@ -157,7 +159,7 @@ void unordered_vcaxpy3_z_evaluate_function (int lo, int hi, int myId, unordered_
 
   for(int j=lo; j < hi; j++) { 
      int i=tab[j];
-   
+
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL *zptr =          &(d.elem(i).elem(0).elem(0).real());
@@ -228,7 +230,7 @@ void unordered_vcaxmy3_evaluate_function (int lo, int hi, int myId, unordered_vc
 
   for(int j=lo; j < hi; j++) { 
      int i=tab[j];
-   
+
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL *zptr =          &(d.elem(i).elem(0).elem(0).real());
@@ -302,7 +304,7 @@ void unordered_vcaxpby3_evaluate_function (int lo, int hi, int myId, unordered_v
 
   for(int j=lo; j < hi; j++) { 
      int i=tab[j];
-   
+
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL *zptr =          &(d.elem(i).elem(0).elem(0).real());
@@ -376,7 +378,7 @@ void unordered_vcaxmby3_evaluate_function (int lo, int hi, int myId, unordered_v
 
   for(int j=lo; j < hi; j++) { 
      int i=tab[j];
-   
+
      REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real());
      REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real());
      REAL *zptr =          &(d.elem(i).elem(0).elem(0).real());
@@ -387,6 +389,7 @@ void unordered_vcaxmby3_evaluate_function (int lo, int hi, int myId, unordered_v
 
 }
 
+} // namespace QDP 
 
 
 #endif
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h b/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h
index 36256ccb9..e1dbd685a 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h
@@ -10,6 +10,14 @@
 #ifndef QDP_SCALARSITE_GENERIC_LINALG_H
 #define QDP_SCALARSITE_GENERIC_LINALG_H
 
+#include "scalarsite_generic/generic_mult_nn.h"
+#include "scalarsite_generic/generic_mult_na.h"
+#include "scalarsite_generic/generic_mult_an.h"
+#include "scalarsite_generic/generic_mult_aa.h"
+#include "scalarsite_generic/generic_mat_vec.h"
+// #include "scalarsite_generic/generic_adj_mat_vec.h" -- No longer used.
+#include "scalarsite_generic/generic_addvec.h"
+
 namespace QDP {
 
 /*! @defgroup optimizations  Optimizations
@@ -22,16 +30,6 @@ namespace QDP {
 // Use this def just to safe some typing later on in the file
 typedef RComplex<REAL>  RComplexFloat;
 
-
-#include "scalarsite_generic/generic_mult_nn.h"
-#include "scalarsite_generic/generic_mult_na.h"
-#include "scalarsite_generic/generic_mult_an.h"
-#include "scalarsite_generic/generic_mult_aa.h"
-#include "scalarsite_generic/generic_mat_vec.h"
-// #include "scalarsite_generic/generic_adj_mat_vec.h" -- No longer used."
-#include "scalarsite_generic/generic_addvec.h"
-
-
 // #define QDP_SCALARSITE_DEBUG
 
 // Optimized version of  
@@ -452,6 +450,7 @@ operator+(const PScalar<PColorVector<RComplexFloat,3> >& l,
   return d;
 }
 
+} // namespace QDP;
 
 #if 1
 
@@ -464,6 +463,8 @@ operator+(const PScalar<PColorVector<RComplexFloat,3> >& l,
 // the wrappers for the function to be threaded
 #include "qdp_scalarsite_generic_linalg_wrapper.h"
 
+namespace QDP {
+
 // Specialization to optimize the case   
 //    LatticeHalfFermion = LatticeColorMatrix * LatticeHalfFermion
 // NOTE: let this be a subroutine to save space
@@ -493,13 +494,13 @@ void evaluate(OLattice<PSpinVector<PColorVector<RComplexFloat, 3>, 2> >& d,
   const H& r = static_cast<const H&>(rhs.expression().right());
 
   if( s.hasOrderedRep() ) { 
-    
+
     int totalSize = s.end() - s.start() + 1;
-    
+
     int base = s.start();
-    
+
     ordered_linalg_user_arg a(d, l, r, base);
-    
+
     dispatch_to_threads(totalSize, a, ordered_linalg_evaluate_userfunc);
 
     ////////////////////
@@ -507,7 +508,7 @@ void evaluate(OLattice<PSpinVector<PColorVector<RComplexFloat, 3>, 2> >& d,
     ////////////////////
     // Ordered Way - loop through sites and save a table lookup
     //for(int i=s.start(); i <= s.end(); i++) { 
-      
+
     //_inline_generic_mult_su3_mat_vec(l.elem(i).elem(),
     //			       r.elem(i).elem(0),
     //			       d.elem(i).elem(0));
@@ -526,14 +527,14 @@ void evaluate(OLattice<PSpinVector<PColorVector<RComplexFloat, 3>, 2> >& d,
     unordered_linalg_user_arg arg(d, l, r, tab);
 
     dispatch_to_threads(totalSize, arg, unordered_linalg_evaluate_userfunc);
-    
+
     ////////////////////
     // Original code
     ////////////////////
     // Unordered Way - do a site table lookup
     //for(int j=0; j < s.numSiteTable(); j++) { 
     //int i = tab[j];
-      
+
     //_inline_generic_mult_su3_mat_vec(l.elem(i).elem(),
     //			       r.elem(i).elem(0),
     //			       d.elem(i).elem(0));
@@ -544,6 +545,8 @@ void evaluate(OLattice<PSpinVector<PColorVector<RComplexFloat, 3>, 2> >& d,
   }
 }
 
+} // namespace QDP;
+
 #endif
 
 /*! @} */   // end of group optimizations
@@ -552,6 +555,4 @@ void evaluate(OLattice<PSpinVector<PColorVector<RComplexFloat, 3>, 2> >& d,
 #undef QDP_SCALARSITE_DEBUG
 #endif
 
-} // namespace QDP;
-
 #endif
diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h
index c1da599ed..a9ae35906 100644
--- a/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h
+++ b/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SCALARSITE_GENERIC_LINALG_WRAPPER_H
 #define QDP_SCALARSITE_GENERIC_LINALG_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -24,7 +26,7 @@ struct ordered_linalg_user_arg{
 			  const C& l_,
 			  const H& r_,
 			  int base_) : d(d_), l(l_),r(r_), base(base_) {}
-			  
+
 };
 
 //! user function for the evaluate function in the ordered situation
@@ -43,7 +45,7 @@ void ordered_linalg_evaluate_userfunc(int lo, int hi, int myId, ordered_linalg_u
 
    // Ordered Way - loop through sites and save a table lookup
    for(int i=low; i < high; i++) { 
-      
+
      _inline_generic_mult_su3_mat_vec(l.elem(i).elem(),
 				      r.elem(i).elem(0),
 				      d.elem(i).elem(0));
@@ -51,7 +53,7 @@ void ordered_linalg_evaluate_userfunc(int lo, int hi, int myId, ordered_linalg_u
 				      r.elem(i).elem(1),
 				      d.elem(i).elem(1));
    }
-  
+
 }
 
 //! user argument for the evaluate function in the unordered situation
@@ -86,7 +88,7 @@ void unordered_linalg_evaluate_userfunc(int lo, int hi, int myId,  unordered_lin
    // Unordered Way - do a site table lookup
    for(int j=lo; j < hi; j++) { 
      int i = tab[j];
-      
+
      _inline_generic_mult_su3_mat_vec(l.elem(i).elem(),
 				      r.elem(i).elem(0),
 				      d.elem(i).elem(0));
@@ -96,5 +98,7 @@ void unordered_linalg_evaluate_userfunc(int lo, int hi, int myId,  unordered_lin
    } 
 }
 
+} // namespace QDP;
+
 
 #endif

From 7ca5ebb3cc3cede0100edf1763769f791727d9d8 Mon Sep 17 00:00:00 2001
From: Jonas Glesaaen <jonas@glesaaen.com>
Date: Mon, 14 May 2018 14:09:09 +0100
Subject: [PATCH 4/5] Moved includes outside of namespaces in
 include/scalarsite_sse

---
 .../scalarsite_sse/qdp_scalarsite_sse_blas.h  |  8 +++++--
 .../qdp_scalarsite_sse_blas_double.h          | 13 ++++++-----
 .../qdp_scalarsite_sse_blas_double_wrapper.h  | 15 +++++++------
 .../qdp_scalarsite_sse_blas_g5.h              |  2 --
 .../qdp_scalarsite_sse_blas_wrapper.h         | 12 ++++++----
 .../qdp_scalarsite_sse_linalg.h               | 22 +++++++++----------
 .../qdp_sse_fused_spin_proj_evaluates.h       | 13 ++++++-----
 ...dp_sse_fused_spin_proj_evaluates_wrapper.h | 12 +++++-----
 ...p_sse_fused_spin_recon_evaluates_wrapper.h | 14 +++++++-----
 .../scalarsite_sse/qdp_sse_spin_evaluates.h   | 12 +++++-----
 .../qdp_sse_spin_evaluates_wrapper.h          |  5 +++--
 include/scalarsite_sse/sse_fused_spin_proj.h  |  1 -
 12 files changed, 73 insertions(+), 56 deletions(-)

diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas.h
index 5668c48ac..ccee33e52 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_blas.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas.h
@@ -34,14 +34,18 @@ namespace QDP {
   typedef PSpinVector<PColorVector<RComplex<REAL32>, 3>, 4> TVec;
   typedef PScalar<PScalar<RScalar<REAL32> > >  TScal;
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+#include "qdp_scalarsite_sse_blas_wrapper.h"
+
+namespace QDP {
 
   ////////////////////////////////
   // Threading evaluates
   //
   // by Xu Guo, EPCC, 6 October, 2008
   ////////////////////////////////
-  // the wrappers for the functions to be threaded
-#include "qdp_scalarsite_sse_blas_wrapper.h"
   
 
 
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h
index 97baf9524..f8a1198f2 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h
@@ -37,6 +37,14 @@ namespace QDP {
 typedef PSpinVector<PColorVector<RComplex<REAL64>, 3>, 4> DVec;
 typedef PScalar<PScalar<RScalar<REAL64> > >  DScal;
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+
+#include "qdp_dispatch.h"
+#include "qdp_scalarsite_sse_blas_double_wrapper.h"
+
+namespace QDP {
 
 ////////////////////////////////
 // Threading evaluates
@@ -44,11 +52,6 @@ typedef PScalar<PScalar<RScalar<REAL64> > >  DScal;
 // by Xu Guo, EPCC, 6 October, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-
-#include "qdp_dispatch.h"
-#include "qdp_scalarsite_sse_blas_double_wrapper.h"
-
 
 // #define DEBUG_BLAS
 // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h
index 658c32396..713784a6d 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h
@@ -1,7 +1,7 @@
 #ifndef QDP_SCALARSITE_GENERIC_BLAS_DOUBLE_WRAPPER_H
 #define QDP_SCALARSITE_GENERIC_BLAS_DOUBLE_WRAPPER_H
 
-
+namespace QDP {
 
 ////////////////////////////////
 // Threading evaluates wrappers
@@ -33,7 +33,7 @@ void ordered_sse_vaxOpy4_double_evaluate_function (int lo, int hi, int myId, ord
   int n_4vec = hi - lo;
 
   int index = lo * 24;
-  
+
   InScale = &InScale[index];
   Out = &Out[index];
 
@@ -221,7 +221,7 @@ void unordered_sse_vscal4_double_evaluate_function (int lo, int hi, int myId, un
      int i=tab[j];
      REAL64 *xptr = (REAL64 *) &(x.elem(i).elem(0).elem(0).real());
      REAL64 *zptr =  &(d.elem(i).elem(0).elem(0).real());
- 
+
      vscal4(zptr, aptr, xptr, Ns);
   }
 
@@ -288,7 +288,7 @@ void ordered_sse_vaxOpbyz4_double_evaluate_function (int lo, int hi, int myId, o
   REAL64* bptr = a->bptr;
   REAL64* yptr = a->yptr;
   void (*func)(REAL64*, REAL64*, REAL64*, REAL64*, REAL64*, int) = a->func;
-  
+
   int n_4vec = hi - lo;
 
   int index = lo * 24;
@@ -339,16 +339,16 @@ void unordered_sse_vaxOpbyz4_double_evaluate_function (int lo, int hi, int myId,
 
   for(int j=lo; j < hi; j++) { 
       int i=tab[j];
-   
+
       REAL64 *xptr = (REAL64 *) &(x.elem(i).elem(0).elem(0).real());
       REAL64 *yptr = (REAL64 *) &(y.elem(i).elem(0).elem(0).real());
       REAL64* zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
       func(zptr, aptr, xptr, bptr, yptr, Ns);
  }
 
 
-  
+
 }
 
 struct ordered_norm_double_user_arg {
@@ -390,6 +390,7 @@ inline void ordered_inner_product_double_func(int lo, int hi, int myId, ordered_
     func( &(a->results[2*myId]), xptr, yptr, nvec);
   }
 
+} // namespace QDP;
 
 
 #endif
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h
index c241798fd..7ba98fae9 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h
@@ -12,8 +12,6 @@
 
 #include "scalarsite_sse/qdp_scalarsite_sse_blas_g5_includes.h"
 
-using namespace QDP;
-
 namespace QDP {
 
 // Types needed for the expression templates. 
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h
index 6914e1599..9961a5127 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SCALARSITE_SSE_BLAS_WRAPPER_H
 #define QDP_SCALARSITE_SSE_BLAS_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -86,7 +88,7 @@ void unordered_sse_vaxOpy3_y_evaluate_function (int lo, int hi, int myId, unorde
      func(yptr, scalep, yptr, xptr, 1);
    }
   }
- 
+
 
 }
 
@@ -230,7 +232,7 @@ void unordered_vOp_z_evaluate_function (int lo, int hi, int myId, unordered_sse_
       REAL32 *xptr = (REAL32 *) &(x.elem(i).elem(0).elem(0).real());
       REAL32 *yptr = (REAL32 *) &(y.elem(i).elem(0).elem(0).real());
       REAL32* zptr =  &(d.elem(i).elem(0).elem(0).real());
-            
+
       // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1
       func(zptr, xptr, yptr, 1);
 
@@ -374,9 +376,9 @@ void unordered_sse_vaxOpby3_evaluate_function (int lo, int hi, int myId, unorder
       REAL32 *xptr = (REAL32 *) &(x.elem(i).elem(0).elem(0).real());
       REAL32 *yptr = (REAL32 *) &(y.elem(i).elem(0).elem(0).real());
       REAL32 * zptr =  &(d.elem(i).elem(0).elem(0).real());
-      
+
       func(zptr, aptr, xptr, bptr, yptr, 1);
-    
+
   }
 
 }
@@ -398,4 +400,6 @@ inline void ordered_norm_single_func(int lo, int hi, int myId, ordered_sse_norm_
     func( &(a->results[myId]), vptr, nvec);
   }
 
+} // namespace QDP;
+
 #endif
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h b/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h
index 931700600..9271ead52 100644
--- a/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h
+++ b/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h
@@ -13,20 +13,10 @@
 // These SSE asm instructions are only supported under GCC/G++
 #if defined(__GNUC__)
 
-namespace QDP {
-
-
 // #define QDP_SCALARSITE_DEBUG
-
 #define QDP_SCALARSITE_USE_EVALUATE
 
-
-/*! @defgroup optimizations  Optimizations
- *
- * Optimizations for basic QDP operations
- *
- * @{
- */
+namespace QDP {
 
 // Use this def just to safe some typing later on in the file
 typedef RComplex<REAL32>  RComplexFloat;
@@ -36,7 +26,7 @@ typedef PScalar<PColorMatrix<RComplexFloat, 3> > TCol;
 typedef PSpinVector<PColorVector<RComplex<REAL32>, 3>, 2> TVec2;
 typedef PSpinVector<PColorVector<RComplex<REAL32>, 3>, 4> TVec4;
 
-
+} // namespace QDP 
 
 #include "sse_mult_su3_nn.h"
 #include "sse_mult_su3_an.h"
@@ -48,6 +38,14 @@ typedef PSpinVector<PColorVector<RComplex<REAL32>, 3>, 4> TVec4;
 #include "sse_mult_su3_mat_hwvec.h"
 #include "sse_mult_adj_su3_mat_hwvec.h"
 
+namespace QDP {
+
+/*! @defgroup optimizations  Optimizations
+ *
+ * Optimizations for basic QDP operations
+ *
+ * @{
+ */
 
 // Optimized version of  
 //    PColorMatrix<RComplexFloat,3> <- PColorMatrix<RComplexFloat,3> * PColorMatrix<RComplexFloat,3>
diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h
index b78441319..0b3a23671 100644
--- a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h
+++ b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h
@@ -3,11 +3,15 @@
 
 #include "sse_mult_adj_su3_mat_hwvec.h"
 
-/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */
-using namespace QDP;
 namespace QDP {
-
 typedef PScalar< PColorMatrix< RComplex<REAL32>, 3> > SU3Mat32;
+} // namespace QDP 
+
+// the wrappers for the functions to be threaded
+#include "qdp_sse_fused_spin_proj_evaluates_wrapper.h"
+
+/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */
+namespace QDP {
 
 
 ////////////////////////////////
@@ -16,9 +20,6 @@ typedef PScalar< PColorMatrix< RComplex<REAL32>, 3> > SU3Mat32;
 // by Xu Guo, EPCC, 20 October, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_sse_fused_spin_proj_evaluates_wrapper.h"
-
 
 // HalfVec = adj(u)*SpinProjectDir0Plus(Vec);
 template<>
diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h
index 686f742e4..a4777ce32 100644
--- a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h
+++ b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SSE_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H
 #define QDP_SSE_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -35,11 +37,11 @@ void ordered_sse_fused_spin_proj_evaluate_function (int lo, int hi, int myId, or
     func( (REAL32 *)&(a.elem(site).elem(0).elem(0).real()),
 		     (REAL32 *)&(tmp.elem(0).elem(0).real()),
 		     1);
-      
+
     su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real());
     half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real());
     half_wilson_vectorf *dh = (half_wilson_vectorf *)&( d.elem(site).elem(0).elem(0).real());
-      
+
     intrin_sse_mult_adj_su3_mat_hwvec(um, tmph, dh);
   }
 
@@ -72,16 +74,16 @@ void unordered_sse_fused_spin_proj_evaluate_function (int lo, int hi, int myId,
     func( (REAL32 *)&(a.elem(site).elem(0).elem(0).real()),
 	  (REAL32 *)&(tmp.elem(0).elem(0).real()),
 	  1);
-      
+
     su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real());
     half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real());
     half_wilson_vectorf *dh = (half_wilson_vectorf *)&( d.elem(site).elem(0).elem(0).real());
-      
+
     intrin_sse_mult_adj_su3_mat_hwvec(um, tmph, dh);
   }
 
 }
 
-
+} // namespace QDP 
 
 #endif
diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h b/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h
index ce461bcac..1eef8e6d7 100644
--- a/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h
+++ b/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SSE_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H
 #define QDP_SSE_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -39,8 +41,8 @@ void ordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId, o
       half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real());
 
       intrin_sse_mult_su3_mat_hwvec(um, ah, tmph);
-      
-      
+
+
       func( (REAL32 *)&(tmp.elem(0).elem(0).real()),
 	    (REAL32 *)&(d.elem(site).elem(0).elem(0).real()),
 	    1);
@@ -73,7 +75,7 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId,
   for (int j = lo; j < hi; j++){
 
     int site=tab[j];
-      
+
     HVec32 tmp ;
 
     su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real());
@@ -81,8 +83,8 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId,
     half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real());
 
     intrin_sse_mult_su3_mat_hwvec(um, ah, tmph);
-      
-      
+
+
     func( (REAL32 *)&(tmp.elem(0).elem(0).real()),
 	  (REAL32 *)&(d.elem(site).elem(0).elem(0).real()),
 	  1);
@@ -90,5 +92,7 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId,
 
 }
 
+} // namespace QDP 
+
 
 #endif
diff --git a/include/scalarsite_sse/qdp_sse_spin_evaluates.h b/include/scalarsite_sse/qdp_sse_spin_evaluates.h
index 80fe29915..f21a5877c 100644
--- a/include/scalarsite_sse/qdp_sse_spin_evaluates.h
+++ b/include/scalarsite_sse/qdp_sse_spin_evaluates.h
@@ -1,7 +1,6 @@
 #ifndef QDP_SSE_SPIN_EVALUATES_H
 #define QDP_SSE_SPIN_EVALUATES_H
 
-using namespace QDP;
 namespace QDP {
 
 // Typedefs
@@ -15,16 +14,19 @@ typedef REAL32 SpinColFull[4][3][2];
 typedef REAL32 SpinColHalf[2][3][2];
 // d = SpinProjectDir0Plus(Vec);
 
+} // namespace QDP;
+
+// the wrappers for the functions to be threaded
+#include "qdp_sse_spin_evaluates_wrapper.h"
+
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates
 //
 // by Xu Guo, EPCC, 13 October, 2008
 ////////////////////////////////
 
-// the wrappers for the functions to be threaded
-#include "qdp_sse_spin_evaluates_wrapper.h"
-
-
 
 template<class A, class B>
 inline
diff --git a/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h b/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h
index 59a49a2b7..88e209b7a 100644
--- a/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h
+++ b/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h
@@ -1,6 +1,8 @@
 #ifndef QDP_SSE_SPIN_EVALUATES_WRAPPER_H
 #define QDP_SSE_SPIN_EVALUATES_WRAPPER_H
 
+namespace QDP {
+
 ////////////////////////////////
 // Threading evaluates wrappers
 //
@@ -65,7 +67,6 @@ void unordered_sse_spin_project_evaluate_function (int lo, int hi, int myId, uno
 
 }
 
-
-
+} // namespace QDP 
 
 #endif
diff --git a/include/scalarsite_sse/sse_fused_spin_proj.h b/include/scalarsite_sse/sse_fused_spin_proj.h
index e7138cacb..12cb1c740 100644
--- a/include/scalarsite_sse/sse_fused_spin_proj.h
+++ b/include/scalarsite_sse/sse_fused_spin_proj.h
@@ -5,7 +5,6 @@
 #include "sse_mult_adj_su3_mat_hwvec.h"
 
 #include <iostream>
-using namespace std;
 
 namespace QDP {
 

From 0788979355d1a31558ca1bb0fc7bf035df12281b Mon Sep 17 00:00:00 2001
From: Jonas Glesaaen <jonas@glesaaen.com>
Date: Mon, 14 May 2018 14:09:53 +0100
Subject: [PATCH 5/5] Moved includes outside of namespaces in scalarsite_bagel
 and scalarvecsite

---
 .../scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h  | 6 ++----
 include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h      | 6 +++---
 include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h    | 6 +++---
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h b/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h
index c123896be..feedc124f 100644
--- a/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h
+++ b/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h
@@ -10,6 +10,8 @@
 #ifndef QDP_SCALARSITE_BAGEL_QDP_LINALG_H
 #define QDP_SCALARSITE_BAGEL_QDP_LINALG_H
 
+#include "bagel_qdp.h"
+
 namespace QDP {
 
 /*! @defgroup optimizations  Optimizations
@@ -21,10 +23,6 @@ namespace QDP {
 
 // Use this def just to safe some typing later on in the file
 
-
-
-#include "bagel_qdp.h"
-
 #if 1
 typedef RComplex<BAGELQDPFloat>  RComplexFloat;
 
diff --git a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h
index 3eac91d6a..693565ee8 100755
--- a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h
+++ b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h
@@ -9,13 +9,13 @@
 #ifndef QDP_SCALARVECSITE_SSE_BLAS_H
 #define QDP_SCALARVECSITE_SSE_BLAS_H
 
+#define QDP_SCALARVECSITE_USE_EVALUATE
+#include "qdp_sse_intrin.h"
+
 namespace QDP {
 
 // #define QDP_SCALARVECSITE_BLAS_DEBUG
 
-#define QDP_SCALARVECSITE_USE_EVALUATE
-#include "qdp_sse_intrin.h"
-
 typedef PScalar<PColorMatrix<RComplex<ILattice<REAL32,4> >, 3> >       TCMat;
 typedef PScalar<PColorVector<RComplex<ILattice<REAL32,4> >, 3> >       TCVec;
 typedef PSpinVector<PColorVector<RComplex<ILattice<REAL32,4> >, 3>, 4> TDirac;
diff --git a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h
index b71b961ac..ce7ea0eab 100755
--- a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h
+++ b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h
@@ -15,11 +15,13 @@
 
 #include "qdp_sse_intrin.h"
 
+#define QDP_SCALARVECSITE_USE_EVALUATE
+#include "scalarvecsite_sse/ssevec_mult_nn.h"
+
 namespace QDP {
 
 // #define QDP_SCALARVECSITE_DEBUG
 
-#define QDP_SCALARVECSITE_USE_EVALUATE
 
 
 
@@ -35,8 +37,6 @@ typedef IScalar<REAL32>                IScalarFloat;
 typedef ILattice<REAL32,4>             ILatticeFloat;
 typedef RComplex<ILattice<REAL32,4> >  RComplexFloat; 
 
-#include "scalarvecsite_sse/ssevec_mult_nn.h"
-
 //--------------------------------------------------------------------------------------
 // Optimized version of  
 // ILatticeFloat <- ILatticeFloat + ILatticeFloat