From f232ad401113ac788054003c829a7a98a666fc7b Mon Sep 17 00:00:00 2001 From: Jonas Glesaaen Date: Mon, 14 May 2018 14:00:39 +0100 Subject: [PATCH 1/5] Moved all std library includes out of namespaces This is necessary because including a standard library header inside a namespace breaks the C++ standard (see item 20.5.2.2.3). This can and will therefore sometimes lead to compilation errors. --- include/qdp_sse_intrin.h | 3 ++- include/scalarsite_sse/sse_blas_local_sumsq_double.h | 4 +--- include/scalarsite_sse/sse_dcomplex_mult_macros.h | 5 ++++- include/scalarsite_sse/sse_spin_proj_inlines.h | 2 +- lib/scalarsite_sse/sse_blas_local_sumsq_double.cc | 4 +--- lib/scalarsite_sse/sse_blas_local_vcdot_double.cc | 8 +++++--- lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc | 5 ++--- lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc | 4 +--- lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc | 5 ++--- lib/scalarsite_sse/sse_blas_vaxpy4_double.cc | 5 ++--- lib/scalarsite_sse/sse_blas_vaypx4_double.cc | 2 +- lib/scalarsite_sse/sse_blas_vscal4_double.cc | 3 +-- lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc | 10 ++++++---- lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc | 9 ++++++--- lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc | 9 ++++++--- lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc | 9 ++++++--- lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc | 2 +- lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc | 2 +- 18 files changed, 49 insertions(+), 42 deletions(-) diff --git a/include/qdp_sse_intrin.h b/include/qdp_sse_intrin.h index 48f5ee5b9..9264fff4f 100644 --- a/include/qdp_sse_intrin.h +++ b/include/qdp_sse_intrin.h @@ -1,9 +1,10 @@ #ifndef QDP_SSE_INTRIN_H #define QDP_SSE_INTRIN_H +#include + // Include the file with the SSE intrinsics in it namespace QDP { -#include typedef __m128 v4sf; typedef union { diff --git a/include/scalarsite_sse/sse_blas_local_sumsq_double.h b/include/scalarsite_sse/sse_blas_local_sumsq_double.h index 8024d636a..e44affc0a 100644 --- 
a/include/scalarsite_sse/sse_blas_local_sumsq_double.h +++ b/include/scalarsite_sse/sse_blas_local_sumsq_double.h @@ -9,14 +9,12 @@ #define QDP_SSE_BLAS_LOCAL_SUMSQ_DOUBLE #include "qdp_precision.h" +#include namespace QDP { -#include - void local_sumsq4(REAL64 *sum, REAL64 *vecptr, int n_4spin); - } // namespace QDP; #endif // guard diff --git a/include/scalarsite_sse/sse_dcomplex_mult_macros.h b/include/scalarsite_sse/sse_dcomplex_mult_macros.h index 0cea2706c..208273269 100644 --- a/include/scalarsite_sse/sse_dcomplex_mult_macros.h +++ b/include/scalarsite_sse/sse_dcomplex_mult_macros.h @@ -7,6 +7,10 @@ /* SSE 2 Headers */ #include +#ifdef QDP_USE_SSE3 +#include +#endif + /* A useful union type allows me to set values into the vector from code */ @@ -122,7 +126,6 @@ #else #warning "Using SSE3" /* SSE 3 */ -#include /* z = x*y z, x, y are SSE registers containing complex numbers ordered with the real part in the low half, imag part diff --git a/include/scalarsite_sse/sse_spin_proj_inlines.h b/include/scalarsite_sse/sse_spin_proj_inlines.h index b76fc82e4..897ada32a 100644 --- a/include/scalarsite_sse/sse_spin_proj_inlines.h +++ b/include/scalarsite_sse/sse_spin_proj_inlines.h @@ -2,13 +2,13 @@ #define SSE_SPIN_PROJ_INLINES_H #include "qdp_sse_intrin.h" +#include /* File: generic_spin_proj_inlines.h Purpose: Supply inline functions to do spin projection Author: $Id: sse_spin_proj_inlines.h,v 1.6 2009-02-11 20:50:45 bjoo Exp $ */ namespace QDP { -#include /** \brief Spin Project (1/2)(1+\gamma_0) * diff --git a/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc b/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc index 2c611d700..8d73e3a6d 100644 --- a/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc +++ b/lib/scalarsite_sse/sse_blas_local_sumsq_double.cc @@ -4,13 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_blas_local_sumsq_double.h" namespace QDP { -#include - - // (Vector) out = (Scalar) (*scalep) * (Vector) InScale + (Vector) Add // #define 
DEBUG_VAXPY_DOUBLE void local_sumsq4(REAL64 *sum, REAL64 *vecptr, int n_4spin) diff --git a/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc b/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc index a2a77d2f8..22fba61a9 100644 --- a/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc +++ b/lib/scalarsite_sse/sse_blas_local_vcdot_double.cc @@ -6,10 +6,13 @@ #include #include "scalarsite_sse/sse_blas_local_vcdot_double.h" +#include "qdp_config.h" -namespace QDP { +#ifdef QDP_USE_SSE3 +#include +#endif -#include "qdp_config.h" +namespace QDP { #ifndef QDP_USE_SSE3 @@ -45,7 +48,6 @@ namespace QDP { #else #warning Using SSE3 /* SSE 3 */ -#include #define CONJMUL(z,x,y) \ { \ diff --git a/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc index 6b4eabdce..80cec5051 100644 --- a/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vaxmbyz4_double.cc @@ -4,13 +4,12 @@ * */ +#include #include "scalarsite_sse/sse_blas_vaxmbyz4_double.h" +#include "scalarsite_sse/sse_prefetch.h" namespace QDP { -#include -#include "scalarsite_sse/sse_prefetch.h" - #ifndef L2BY2 #define L2BY2 1365 /* L2 / 2 in SPINORS */ #endif diff --git a/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc index 7b69d3666..b43708a13 100644 --- a/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vaxmyz4_double.cc @@ -4,13 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_blas_vaxmyz4_double.h" namespace QDP { -#include - - void vaxmyz4(REAL64 *Out,REAL64 *scalep,REAL64 *InScale, REAL64 *Add,int n_4vec) { __m128d scalar; diff --git a/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc b/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc index e48059d3d..81af4113e 100644 --- a/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vaxpbyz4_double.cc @@ -4,13 +4,12 @@ * */ +#include #include "scalarsite_sse/sse_blas_vaxpbyz4_double.h" +#include 
"scalarsite_sse/sse_prefetch.h" namespace QDP { -#include -#include "scalarsite_sse/sse_prefetch.h" - #ifndef L2BY2 #define L2BY2 1365 /* L2 / 2 in SPINORS */ diff --git a/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc b/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc index bf8342e5b..03c365b01 100644 --- a/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vaxpy4_double.cc @@ -4,14 +4,13 @@ * */ +#include #include "scalarsite_sse/sse_blas_vaxpy4_double.h" +#include "scalarsite_sse/sse_prefetch.h" namespace QDP { -#include -#include "scalarsite_sse/sse_prefetch.h" - #ifndef L2BY2 #define L2BY2 1365 /* L2 / 2 in SPINORS */ #endif diff --git a/lib/scalarsite_sse/sse_blas_vaypx4_double.cc b/lib/scalarsite_sse/sse_blas_vaypx4_double.cc index da8e14b28..dc4db5f5f 100644 --- a/lib/scalarsite_sse/sse_blas_vaypx4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vaypx4_double.cc @@ -4,11 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_blas_vaypx4_double.h" namespace QDP { -#include void vaypx4(REAL64 *Out,REAL64 *scalep,REAL64 *InScale, int n_4spin) { diff --git a/lib/scalarsite_sse/sse_blas_vscal4_double.cc b/lib/scalarsite_sse/sse_blas_vscal4_double.cc index a2399b87f..77fa018ac 100644 --- a/lib/scalarsite_sse/sse_blas_vscal4_double.cc +++ b/lib/scalarsite_sse/sse_blas_vscal4_double.cc @@ -4,12 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_blas_vscal4_double.h" namespace QDP { -#include - void vscal4(REAL64 *z,REAL64 *a,REAL64 *x, int n_4spin) { diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc index 0139147dd..998a3b8c9 100644 --- a/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_eq_hh_double.cc @@ -4,18 +4,21 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" +#include "qdp_config.h" -namespace QDP { +#ifdef QDP_USE_SSE3 +#include +#endif -#include +namespace QDP { typedef union { __m128d v; double d[2]; } VD; -#include 
"qdp_config.h" #ifndef QDP_USE_SSE3 @@ -54,7 +57,6 @@ typedef union { #else #warning Using SSE3 /* SSE 3 */ -#include #define CCMUL(z,x,y) \ { \ __m128d t1; \ diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc index 7f040aa0f..9307eb2bc 100644 --- a/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_eq_hm_double.cc @@ -4,12 +4,16 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" +#include "qdp_config.h" + +#ifdef QDP_USE_SSE3 +#include +#endif namespace QDP { -#include -#include "qdp_config.h" #ifndef QDP_USE_SSE3 @@ -45,7 +49,6 @@ namespace QDP { #else #warning Using SSE3 /* SSE 3 */ -#include #define CONJMUL(z,x,y) \ { \ diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc index a253f5be5..da845c7a8 100644 --- a/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_eq_mh_double.cc @@ -4,12 +4,16 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" +#include "qdp_config.h" + +#ifdef QDP_USE_SSE3 +#include +#endif namespace QDP { -#include -#include "qdp_config.h" #ifndef QDP_USE_SSE3 @@ -45,7 +49,6 @@ namespace QDP { #else #warning Using SSE3 /* SSE 3 */ -#include #define CONJMUL(z,x,y) \ { \ diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc b/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc index 6b0023ff8..f6a3d1b74 100644 --- a/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_eq_mm_double.cc @@ -4,13 +4,17 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" +#include "qdp_config.h" + +#ifdef QDP_USE_SSE3 +#include +#endif namespace QDP { -#include -#include "qdp_config.h" #ifndef QDP_USE_SSE3 // c = x*y; @@ -46,7 +50,6 @@ namespace QDP { #else #warning Using SSE3 -#include // Use SSE3 #define CMUL(z,x,y) \ diff --git a/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc 
b/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc index 10efd2f96..5cfcfed20 100644 --- a/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_eq_scal_m_double.cc @@ -4,11 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" namespace QDP { -#include /* M = a*M a is scalar */ void ssed_m_eq_scal_m(REAL64* m2, REAL64* a, REAL64 *m1, int n_mat) diff --git a/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc b/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc index bbac06b74..eca330c0b 100644 --- a/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc +++ b/lib/scalarsite_sse/sse_linalg_m_peq_m_double.cc @@ -4,11 +4,11 @@ * */ +#include #include "scalarsite_sse/sse_linalg_mm_su3_double.h" namespace QDP { -#include typedef union { double c[2]; From e8c5017b2f081c8a26773022a2a0391e5473970d Mon Sep 17 00:00:00 2001 From: Jonas Glesaaen Date: Mon, 14 May 2018 14:06:57 +0100 Subject: [PATCH 2/5] Moved all includes outside of namespaces in include root One should in general not have includes inside of namespaces unless strictly necessary (which it isn't in this case). 
--- include/qdp_config.h | 4 ++++ include/qdp_params.h | 5 ++--- include/qdp_parscalar_specific.h | 4 +++- include/qdp_precision.h | 4 ++++ include/qdp_scalar_specific.h | 4 ++++ include/qdp_scalarsite_defs.h | 6 +++--- include/qdp_scalarvecsite_defs.h | 6 +++--- 7 files changed, 23 insertions(+), 10 deletions(-) diff --git a/include/qdp_config.h b/include/qdp_config.h index 23ccae6a5..64f412511 100644 --- a/include/qdp_config.h +++ b/include/qdp_config.h @@ -14,6 +14,8 @@ /* Include the stuff generated by autoconf */ #include "qdp_config_internal.h" +namespace QDP { + /* Prefix everything with QDP_ */ static const char* const QDP_PACKAGE(PACKAGE); static const char* const QDP_PACKAGE_BUGREPORT(PACKAGE_BUGREPORT); @@ -23,6 +25,8 @@ static const char* const QDP_PACKAGE_TARNAME(PACKAGE_TARNAME); static const char* const QDP_PACKAGE_VERSION(PACKAGE_VERSION); static const char* const QDP_VERSION(VERSION); +} // namespace QDP + /* Undef the unwanted */ #undef PACKAGE diff --git a/include/qdp_params.h b/include/qdp_params.h index d0b8042e3..c81e43bf6 100644 --- a/include/qdp_params.h +++ b/include/qdp_params.h @@ -7,8 +7,9 @@ #ifndef QDP_PARAMS_H #define QDP_PARAMS_H -namespace QDP { +#include +namespace QDP { /*! @defgroup params Fundamental parameters for QDP * @@ -19,8 +20,6 @@ namespace QDP { * @{ */ -#include - const int Nd = QDP_ND; const int Nc = QDP_NC; const int Ns = QDP_NS; diff --git a/include/qdp_parscalar_specific.h b/include/qdp_parscalar_specific.h index e5842f124..b345a7019 100644 --- a/include/qdp_parscalar_specific.h +++ b/include/qdp_parscalar_specific.h @@ -323,10 +323,12 @@ void evaluate_userfunc(int lo, int hi, int myId, user_arg *a) } } +} // namespace QDP + //! include the header file for dispatch #include "qdp_dispatch.h" - +namespace QDP { //----------------------------------------------------------------------------- //! 
OLattice Op Scalar(Expression(source)) under an Subset diff --git a/include/qdp_precision.h b/include/qdp_precision.h index d85fa0cb8..01ec77053 100644 --- a/include/qdp_precision.h +++ b/include/qdp_precision.h @@ -11,6 +11,8 @@ // Fix Definitions #include +namespace QDP { + // Fix default precision #if ! defined(BASE_PRECISION) #define BASE_PRECISION 32 @@ -35,6 +37,8 @@ typedef REAL64 DOUBLE; typedef REAL64 REAL; typedef REAL64 DOUBLE; +} // namespace QDP + #define INNER_LOG 1 #else diff --git a/include/qdp_scalar_specific.h b/include/qdp_scalar_specific.h index 372a90308..6568b58fd 100644 --- a/include/qdp_scalar_specific.h +++ b/include/qdp_scalar_specific.h @@ -139,9 +139,13 @@ void evaluate_userfunc(int lo, int hi, int myId, user_arg *a) } } +} // namespace QDP + //! include the header file for dispatch #include "qdp_dispatch.h" +namespace QDP { + //----------------------------------------------------------------------------- //! OLattice Op Scalar(Expression(source)) under an Subset /*! diff --git a/include/qdp_scalarsite_defs.h b/include/qdp_scalarsite_defs.h index 4c368d05e..0a3ace630 100644 --- a/include/qdp_scalarsite_defs.h +++ b/include/qdp_scalarsite_defs.h @@ -7,6 +7,9 @@ #ifndef QDP_SCALARSITE_DEFS_H #define QDP_SCALARSITE_DEFS_H +#include +#include "qdp_precision.h" + namespace QDP { /*! \addtogroup defs Type definitions @@ -18,9 +21,6 @@ namespace QDP { * @{ */ -#include -#include "qdp_precision.h" - //---------------------------------------------------------------------- //! 
Gamma matrices are conveniently defined for this Ns typedef GammaType Gamma; diff --git a/include/qdp_scalarvecsite_defs.h b/include/qdp_scalarvecsite_defs.h index 2f6b7e40c..460dc0c8a 100644 --- a/include/qdp_scalarvecsite_defs.h +++ b/include/qdp_scalarvecsite_defs.h @@ -7,6 +7,9 @@ #ifndef QDP_SCALARVECSITE_DEFS_H #define QDP_SCALARVECSITE_DEFS_H +#include +#include "qdp_precision.h" + namespace QDP { @@ -19,9 +22,6 @@ namespace QDP { * @{ */ -#include -#include "qdp_precision.h" - //---------------------------------------------------------------------- //! Gamma matrices are conveniently defined for this Ns typedef GammaType Gamma; From 408c9b43f2e99a4285524e0c722fdd787aac89d2 Mon Sep 17 00:00:00 2001 From: Jonas Glesaaen Date: Mon, 14 May 2018 14:08:40 +0100 Subject: [PATCH 3/5] Moved includes outside of namespaces in include/scalarsite_generic --- include/qdp_scalarsite_pabasm.h | 19 +++---- .../qdp_generic_fused_spin_proj_evaluates.h | 13 ++--- ...eneric_fused_spin_proj_evaluates_wrapper.h | 13 +++-- .../qdp_generic_fused_spin_recon_evaluates.h | 11 +--- ...neric_fused_spin_recon_evaluates_wrapper.h | 15 ++--- .../qdp_generic_spin_project_evaluates.h | 11 ++-- ...p_generic_spin_project_evaluates_wrapper.h | 6 +- .../qdp_scalarsite_generic_blas.h | 10 +++- .../qdp_scalarsite_generic_blas_g5.h | 12 ++-- .../qdp_scalarsite_generic_blas_g5_wrapper.h | 57 +++++++++---------- .../qdp_scalarsite_generic_blas_wrapper.h | 20 +++---- .../qdp_scalarsite_generic_cblas.h | 10 +++- .../qdp_scalarsite_generic_cblas_wrapper.h | 15 +++-- .../qdp_scalarsite_generic_linalg.h | 39 ++++++------- .../qdp_scalarsite_generic_linalg_wrapper.h | 12 ++-- 15 files changed, 136 insertions(+), 127 deletions(-) diff --git a/include/qdp_scalarsite_pabasm.h b/include/qdp_scalarsite_pabasm.h index 84143a32b..2cd9f4adf 100644 --- a/include/qdp_scalarsite_pabasm.h +++ b/include/qdp_scalarsite_pabasm.h @@ -11,6 +11,15 @@ #warning "Using PABASM Scalarsite" +#include 
"scalarsite_generic/generic_mult_nn.h" +#include "scalarsite_generic/generic_mult_na.h" +#include "scalarsite_generic/generic_mult_an.h" +#include "scalarsite_generic/generic_mult_aa.h" +#include "scalarsite_generic/generic_mat_vec.h" +#include "scalarsite_generic/generic_adj_mat_vec.h" +#include "scalarsite_generic/generic_addvec.h" + + namespace QDP { /*! @defgroup optimizations Optimizations @@ -23,16 +32,6 @@ namespace QDP { // Use this def just to safe some typing later on in the file #define RComplexFloat RComplex - -#include "scalarsite_generic/generic_mult_nn.h" -#include "scalarsite_generic/generic_mult_na.h" -#include "scalarsite_generic/generic_mult_an.h" -#include "scalarsite_generic/generic_mult_aa.h" -#include "scalarsite_generic/generic_mat_vec.h" -#include "scalarsite_generic/generic_adj_mat_vec.h" -#include "scalarsite_generic/generic_addvec.h" - - // #define QDP_SCALARSITE_DEBUG // Optimized version of diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h index 04de0f05f..4c0dc785d 100644 --- a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h +++ b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates.h @@ -1,14 +1,15 @@ #ifndef QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_H #define QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_H - -/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */ -using namespace QDP; namespace QDP { - typedef PScalar< PColorMatrix< RComplex, 3> > SU3Mat; +} // namespace QDP; +// ther wrappers for the functions to be threaded +#include "qdp_generic_fused_spin_proj_evaluates_wrapper.h" +/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */ +namespace QDP { //////////////////////////////// // Threading evaluates @@ -16,10 +17,6 @@ typedef PScalar< PColorMatrix< RComplex, 3> > SU3Mat; // by Xu Guo, EPCC, 28 August, 2008 //////////////////////////////// -// ther wrappers for the functions to be threaded 
-#include "qdp_generic_fused_spin_proj_evaluates_wrapper.h" - - // HalfVec = adj(u)*SpinProjectDir0Plus(Vec); template<> inline diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h index d128a2ee9..3d4a9ff95 100644 --- a/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h +++ b/include/scalarsite_generic/qdp_generic_fused_spin_proj_evaluates_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H #define QDP_GENERIC_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -33,7 +35,7 @@ void ordered_fused_spin_proj_evaluate_function (int lo, int hi, int myId, ordere for (int site = low; site < high; site++){ HVec tmp; func( (REAL *)&(a.elem(site).elem(0).elem(0).real()),(REAL *)&(tmp.elem(0).elem(0).real()), 1); - + _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(0), d.elem(site).elem(0)); _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(1), d.elem(site).elem(1)); } @@ -63,18 +65,17 @@ void unordered_fused_spin_proj_evaluate_function (int lo, int hi, int myId, unor for (int j = lo; j < hi; j++){ int site = tab[j]; - + HVec tmp; func( (REAL *)&(a.elem(site).elem(0).elem(0).real()), (REAL *)&(tmp.elem(0).elem(0).real()), 1); - - + + _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(0), d.elem(site).elem(0)); _inline_mult_adj_su3_mat_vec(u.elem(site).elem(), tmp.elem(1), d.elem(site).elem(1)); } } - - +} // namespace QDP; #endif diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h index 3859c696c..e333123bf 100644 --- a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h +++ b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates.h @@ -1,8 +1,10 @@ #ifndef 
QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_H #define QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_H -namespace QDP { +// the wrappers for the functions to be threaded +#include "qdp_generic_fused_spin_recon_evaluates_wrapper.h" +namespace QDP { //////////////////////////////// // Threading evaluates @@ -10,9 +12,6 @@ namespace QDP { // by Xu Guo, EPCC, 28 August, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_generic_fused_spin_recon_evaluates_wrapper.h" - // Vec = SpinReconstructDir0Plus( u * psi); template<> @@ -1210,8 +1209,4 @@ void evaluate(OLattice< FVec >& d, } // namespace QDP; - - - - #endif diff --git a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h index 241d48b6c..cda7d8fdf 100644 --- a/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h +++ b/include/scalarsite_generic/qdp_generic_fused_spin_recon_evaluates_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H #define QDP_GENERIC_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -34,8 +36,8 @@ void ordered_fused_spin_recon_evaluate_function (int lo, int hi, int myId, order HVec tmp; _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(0), tmp.elem(0)); _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(1), tmp.elem(1)); - - + + func( (REAL *)&(tmp.elem(0).elem(0).real()), (REAL *)&(d.elem(site).elem(0).elem(0).real()), 1); } @@ -65,19 +67,18 @@ void unordered_fused_spin_recon_evaluate_function (int lo, int hi, int myId, uno for (int j = lo; j < hi; j++){ int site = tab[j]; - + HVec tmp; _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(0), tmp.elem(0)); _inline_mult_su3_mat_vec(u.elem(site).elem(), a.elem(site).elem(1), tmp.elem(1)); - - + + func( (REAL *)&(tmp.elem(0).elem(0).real()), 
(REAL *)&(d.elem(site).elem(0).elem(0).real()),1); } } - - +} // namespace QDP; #endif diff --git a/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h b/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h index 1904522e5..da6b3a89d 100644 --- a/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h +++ b/include/scalarsite_generic/qdp_generic_spin_project_evaluates.h @@ -1,7 +1,6 @@ #ifndef QDP_GENERIC_SPIN_PROJECT_EVALUTATES_H #define QDP_GENERIC_SPIN_PROJECT_EVALUTATES_H -using namespace QDP; namespace QDP { // Typedefs @@ -15,15 +14,19 @@ typedef REAL SpinColFull[4][3][2]; typedef REAL SpinColHalf[2][3][2]; // d = SpinProjectDir0Plus(Vec); +} // namespace QDP; + +// the wrappers for the functions to be threaded +#include "qdp_generic_spin_project_evaluates_wrapper.h" + +namespace QDP { + //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 28 August, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_generic_spin_project_evaluates_wrapper.h" - template inline diff --git a/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h b/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h index 73c9b3335..10d4b0ef7 100644 --- a/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h +++ b/include/scalarsite_generic/qdp_generic_spin_project_evaluates_wrapper.h @@ -1,6 +1,7 @@ #ifndef QDP_GENERIC_SPIN_PROJECT_EVALUATES_WRAPPER_H #define QDP_GENERIC_SPIN_PROJECT_EVALUATES_WRAPPER_H +namespace QDP { //////////////////////////////// // Threading evaluates wrappers @@ -64,9 +65,6 @@ void unordered_spin_project_evaluate_function (int lo, int hi, int myId, unorder } - - - - +} // namespace QDP #endif diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas.h index bbb99cbe9..26fb3035e 100644 --- 
a/include/scalarsite_generic/qdp_scalarsite_generic_blas.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas.h @@ -28,15 +28,19 @@ namespace QDP { typedef PSpinVector, 3>, 4> TVec; typedef PScalar > > TScal; +} // namespace QDP; + +// the wrappers for the functions to be threaded +#include "qdp_scalarsite_generic_blas_wrapper.h" + +namespace QDP { + //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 12 August, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_scalarsite_generic_blas_wrapper.h" - // #define DEBUG_BLAS // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h index 7e5389e8f..9d43ce18c 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5.h @@ -17,8 +17,6 @@ #include "scalarsite_generic/generic_blas_vaxpby3_g5.h" #include "scalarsite_generic/generic_blas_g5.h" -using namespace QDP; - namespace QDP { // Types needed for the expression templates. 
@@ -26,15 +24,19 @@ namespace QDP { typedef PSpinVector, 3>, Ns> TVec; typedef PScalar > > TScal; +} // namespace QDP + +// the wrappers for the functions to be threaded +#include "qdp_scalarsite_generic_blas_g5_wrapper.h" + +namespace QDP { + //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 26 August, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_scalarsite_generic_blas_g5_wrapper.h" - // #define DEBUG_BLAS_G6 // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h index fd2e452fe..db677db2c 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_g5_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SCALARSITE_GENERIC_BLAS_G5_WRAPPER_H #define QDP_SCALARSITE_GENERIC_BLAS_G5_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -70,7 +72,7 @@ void unordered_vaypx3_g5_y_evaluate_function (int lo, int hi, int myId, unordere int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real()); @@ -78,7 +80,7 @@ void unordered_vaypx3_g5_y_evaluate_function (int lo, int hi, int myId, unordere func(yptr, aptr, yptr, xptr, Ns); } } - + // structure for vaypx3_g5 of NOT having order (with z ) struct unordered_vaypx3_g5_z_user_arg{ @@ -90,7 +92,7 @@ struct unordered_vaypx3_g5_z_user_arg{ int Ns_, const int* tab_, void (*func_)(REAL*, REAL*, REAL*, REAL*, int)) : x(x_),y(y_),d(d_),aptr(aptr_), Ns(Ns_), tab(tab_),func(func_) {} - + const OLattice< TVec >& x; const OLattice< TVec >& y; OLattice< TVec >& d; @@ -111,14 +113,14 @@ void unordered_vaypx3_g5_z_evaluate_function 
(int lo, int hi, int myId, unordere int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1 func(zptr, aptr, xptr, yptr, Ns); } @@ -181,7 +183,7 @@ void unordered_vadd3_g5_evaluate_function (int lo, int hi, int myId, unordered_v int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real()); @@ -236,7 +238,7 @@ struct unordered_vaxpy3_g5_user_arg{ int Ns_, const int* tab_, void (*func_)(REAL*, REAL*, REAL*, REAL*, int)) : x(x_), y(y_),d(d_),aptr(aptr_),Ns(Ns_), tab(tab_), func(func_) {} - + const OLattice< TVec >& x; const OLattice< TVec >& y; OLattice< TVec >& d; @@ -257,13 +259,13 @@ void unordered_vaxpy3_g5_evaluate_function (int lo, int hi, int myId, unordered_ int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, yptr, Ns); } } @@ -297,7 +299,7 @@ void ordered_vscal_g5_evaluate_function (int lo, int hi, int myId, ordered_vscal int index = lo * 24; Out = &Out[index]; In = &In[index]; - + func(Out, scalep, In, n_4vec); } @@ -330,12 +332,12 @@ void unordered_vscal_g5_evaluate_function (int lo, int hi, int myId, unordered_v int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL* zptr = 
&(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, Ns); } } @@ -414,13 +416,13 @@ void unordered_vaxpby3_g5_evaluate_function (int lo, int hi, int myId, unordered int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, bptr, yptr, Ns); } } @@ -453,7 +455,7 @@ void ordered_scal_g5_evaluate_function (int lo, int hi, int myId, ordered_scal_g int index = lo * 24; Out = &Out[index]; In = &In[index]; - + scal_g5(Out, scalep, In, n_4vec); } @@ -482,12 +484,12 @@ void unordered_scal_g5_evaluate_function (int lo, int hi, int myId, unordered_sc REAL* aptr = a->aptr; int Ns = a->Ns; const int* tab = a->tab; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + scal_g5(zptr, aptr, xptr, Ns); } } @@ -562,13 +564,13 @@ void unordered_xOpayz_g5_evaluate_function (int lo, int hi, int myId, unordered_ int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, yptr, Ns); } } @@ -645,13 +647,13 @@ void unordered_axOpbyz_g5_evaluate_function (int lo, int hi, int myId, unordered int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, 
aptr, xptr, bptr, yptr, Ns); } } @@ -724,13 +726,13 @@ void unordered_xOpayz_ig5_y_evaluate_function (int lo, int hi, int myId, unorder int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, yptr, Ns); } } @@ -765,19 +767,16 @@ void unordered_xOpayz_ig5_z_evaluate_function (int lo, int hi, int myId, unorder int Ns = a->Ns; const int* tab = a->tab; void (*func)(REAL*, REAL*, REAL*, REAL*, int) = a->func; - + for(int j=lo; j < hi; j++) { int i=tab[j]; REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, zptr, xptr, Ns); } } - - - - +} // namespace QDP; #endif diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h index acfa37c46..6aecf2dd1 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_blas_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SCALARSITE_GENERIC_BLAS_WRAPPER_H #define QDP_SCALARSITE_GENERIC_BLAS_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -79,7 +81,7 @@ void unordered_vaxpy3_y_evaluate_function (int lo, int hi, int myId, unordered_v vaxpy3(yptr, scalep, yptr, xptr, 1); } } - + } @@ -246,7 +248,7 @@ void ordered_vscal_evaluate_function (int lo, int hi, int myId, ordered_vscal_us // structure for vscal of NOT having order struct unordered_vscal_user_arg { unordered_vscal_user_arg( - + const OLattice< TVec >& x_, OLattice< TVec >& d_, REAL* scalep_, @@ -341,10 +343,10 @@ void unordered_vaxpby3_evaluate_function (int lo, int hi, int myId, unordered_va REAL *xptr = (REAL *) 
&(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1 vaxpby3(zptr, aptr, xptr, bptr, yptr, 1); - + } } @@ -415,18 +417,14 @@ void unordered_vaxmby3_evaluate_function (int lo, int hi, int myId, unordered_va REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL* zptr = &(d.elem(i).elem(0).elem(0).real()); - + // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1 vaxmby3(zptr, aptr, xptr, bptr, yptr,1); - + } } - - - - - +} // namespace QDP #endif diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h b/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h index 9b659af8c..66b7ec4b9 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_cblas.h @@ -14,15 +14,19 @@ namespace QDP { typedef PScalar > > CScal; typedef PSpinVector, 3>, 4> CTVec; +} // namespace QDP; + +// the wrappers for the functions to be threaded +#include "qdp_scalarsite_generic_cblas_wrapper.h" + +namespace QDP { + //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 26 August, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_scalarsite_generic_cblas_wrapper.h" - // vector z *= complex a template<> inline diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h index 07aec4ab6..aebe89702 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_cblas_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SCALARSITE_GENERIC_CBLAS_WRAPPER_H #define QDP_SCALARSITE_GENERIC_CBLAS_WRAPPER_H +namespace QDP { + //////////////////////////////// 
// Threading evaluates wrappers // @@ -120,12 +122,12 @@ void unordered_vcaxpy3_y_evaluate_function (int lo, int hi, int myId, unordered_ for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL* xptr = (REAL *)&(x.elem(i).elem(0).elem(0).real()); REAL* yptr = &(d.elem(i).elem(0).elem(0).real()); vcaxpy3(yptr, scalep, xptr, yptr, 1); - + } } @@ -157,7 +159,7 @@ void unordered_vcaxpy3_z_evaluate_function (int lo, int hi, int myId, unordered_ for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL *zptr = &(d.elem(i).elem(0).elem(0).real()); @@ -228,7 +230,7 @@ void unordered_vcaxmy3_evaluate_function (int lo, int hi, int myId, unordered_vc for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL *zptr = &(d.elem(i).elem(0).elem(0).real()); @@ -302,7 +304,7 @@ void unordered_vcaxpby3_evaluate_function (int lo, int hi, int myId, unordered_v for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL *zptr = &(d.elem(i).elem(0).elem(0).real()); @@ -376,7 +378,7 @@ void unordered_vcaxmby3_evaluate_function (int lo, int hi, int myId, unordered_v for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL *xptr = (REAL *) &(x.elem(i).elem(0).elem(0).real()); REAL *yptr = (REAL *) &(y.elem(i).elem(0).elem(0).real()); REAL *zptr = &(d.elem(i).elem(0).elem(0).real()); @@ -387,6 +389,7 @@ void unordered_vcaxmby3_evaluate_function (int lo, int hi, int myId, unordered_v } +} // namespace QDP #endif diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h b/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h index 36256ccb9..e1dbd685a 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h +++ 
b/include/scalarsite_generic/qdp_scalarsite_generic_linalg.h @@ -10,6 +10,14 @@ #ifndef QDP_SCALARSITE_GENERIC_LINALG_H #define QDP_SCALARSITE_GENERIC_LINALG_H +#include "scalarsite_generic/generic_mult_nn.h" +#include "scalarsite_generic/generic_mult_na.h" +#include "scalarsite_generic/generic_mult_an.h" +#include "scalarsite_generic/generic_mult_aa.h" +#include "scalarsite_generic/generic_mat_vec.h" +// #include "scalarsite_generic/generic_adj_mat_vec.h" -- No longer used." +#include "scalarsite_generic/generic_addvec.h" + namespace QDP { /*! @defgroup optimizations Optimizations @@ -22,16 +30,6 @@ namespace QDP { // Use this def just to safe some typing later on in the file typedef RComplex RComplexFloat; - -#include "scalarsite_generic/generic_mult_nn.h" -#include "scalarsite_generic/generic_mult_na.h" -#include "scalarsite_generic/generic_mult_an.h" -#include "scalarsite_generic/generic_mult_aa.h" -#include "scalarsite_generic/generic_mat_vec.h" -// #include "scalarsite_generic/generic_adj_mat_vec.h" -- No longer used." 
-#include "scalarsite_generic/generic_addvec.h" - - // #define QDP_SCALARSITE_DEBUG // Optimized version of @@ -452,6 +450,7 @@ operator+(const PScalar >& l, return d; } +} // namespace QDP; #if 1 @@ -464,6 +463,8 @@ operator+(const PScalar >& l, // the wrappers for the function to be threaded #include "qdp_scalarsite_generic_linalg_wrapper.h" +namespace QDP { + // Specialization to optimize the case // LatticeHalfFermion = LatticeColorMatrix * LatticeHalfFermion // NOTE: let this be a subroutine to save space @@ -493,13 +494,13 @@ void evaluate(OLattice, 2> >& d, const H& r = static_cast(rhs.expression().right()); if( s.hasOrderedRep() ) { - + int totalSize = s.end() - s.start() + 1; - + int base = s.start(); - + ordered_linalg_user_arg a(d, l, r, base); - + dispatch_to_threads(totalSize, a, ordered_linalg_evaluate_userfunc); //////////////////// @@ -507,7 +508,7 @@ void evaluate(OLattice, 2> >& d, //////////////////// // Ordered Way - loop through sites and save a table lookup //for(int i=s.start(); i <= s.end(); i++) { - + //_inline_generic_mult_su3_mat_vec(l.elem(i).elem(), // r.elem(i).elem(0), // d.elem(i).elem(0)); @@ -526,14 +527,14 @@ void evaluate(OLattice, 2> >& d, unordered_linalg_user_arg arg(d, l, r, tab); dispatch_to_threads(totalSize, arg, unordered_linalg_evaluate_userfunc); - + //////////////////// // Original code //////////////////// // Unordered Way - do a site table lookup //for(int j=0; j < s.numSiteTable(); j++) { //int i = tab[j]; - + //_inline_generic_mult_su3_mat_vec(l.elem(i).elem(), // r.elem(i).elem(0), // d.elem(i).elem(0)); @@ -544,6 +545,8 @@ void evaluate(OLattice, 2> >& d, } } +} // namespace QDP; + #endif /*! 
@} */ // end of group optimizations @@ -552,6 +555,4 @@ void evaluate(OLattice, 2> >& d, #undef QDP_SCALARSITE_DEBUG #endif -} // namespace QDP; - #endif diff --git a/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h b/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h index c1da599ed..a9ae35906 100644 --- a/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h +++ b/include/scalarsite_generic/qdp_scalarsite_generic_linalg_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SCALARSITE_GENERIC_LINALG_WRAPPER_H #define QDP_SCALARSITE_GENERIC_LINALG_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -24,7 +26,7 @@ struct ordered_linalg_user_arg{ const C& l_, const H& r_, int base_) : d(d_), l(l_),r(r_), base(base_) {} - + }; //! user function for the evaluate function in the ordered situation @@ -43,7 +45,7 @@ void ordered_linalg_evaluate_userfunc(int lo, int hi, int myId, ordered_linalg_u // Ordered Way - loop through sites and save a table lookup for(int i=low; i < high; i++) { - + _inline_generic_mult_su3_mat_vec(l.elem(i).elem(), r.elem(i).elem(0), d.elem(i).elem(0)); @@ -51,7 +53,7 @@ void ordered_linalg_evaluate_userfunc(int lo, int hi, int myId, ordered_linalg_u r.elem(i).elem(1), d.elem(i).elem(1)); } - + } //! 
user argument for the evaluate function in the unordered situation @@ -86,7 +88,7 @@ void unordered_linalg_evaluate_userfunc(int lo, int hi, int myId, unordered_lin // Unordered Way - do a site table lookup for(int j=lo; j < hi; j++) { int i = tab[j]; - + _inline_generic_mult_su3_mat_vec(l.elem(i).elem(), r.elem(i).elem(0), d.elem(i).elem(0)); @@ -96,5 +98,7 @@ void unordered_linalg_evaluate_userfunc(int lo, int hi, int myId, unordered_lin } } +} // namespace QDP; + #endif From 7ca5ebb3cc3cede0100edf1763769f791727d9d8 Mon Sep 17 00:00:00 2001 From: Jonas Glesaaen Date: Mon, 14 May 2018 14:09:09 +0100 Subject: [PATCH 4/5] Moves includes outside of namespaces in include/scalarsite_sse --- .../scalarsite_sse/qdp_scalarsite_sse_blas.h | 8 +++++-- .../qdp_scalarsite_sse_blas_double.h | 13 ++++++----- .../qdp_scalarsite_sse_blas_double_wrapper.h | 15 +++++++------ .../qdp_scalarsite_sse_blas_g5.h | 2 -- .../qdp_scalarsite_sse_blas_wrapper.h | 12 ++++++---- .../qdp_scalarsite_sse_linalg.h | 22 +++++++++---------- .../qdp_sse_fused_spin_proj_evaluates.h | 13 ++++++----- ...dp_sse_fused_spin_proj_evaluates_wrapper.h | 12 +++++----- ...p_sse_fused_spin_recon_evaluates_wrapper.h | 14 +++++++----- .../scalarsite_sse/qdp_sse_spin_evaluates.h | 12 +++++----- .../qdp_sse_spin_evaluates_wrapper.h | 5 +++-- include/scalarsite_sse/sse_fused_spin_proj.h | 1 - 12 files changed, 73 insertions(+), 56 deletions(-) diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas.h index 5668c48ac..ccee33e52 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_blas.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas.h @@ -34,14 +34,18 @@ namespace QDP { typedef PSpinVector, 3>, 4> TVec; typedef PScalar > > TScal; +} // namespace QDP; + +// the wrappers for the functions to be threaded +#include "qdp_scalarsite_sse_blas_wrapper.h" + +namespace QDP { //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 6 October, 
2008 //////////////////////////////// - // the wrappers for the functions to be threaded -#include "qdp_scalarsite_sse_blas_wrapper.h" diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h index 97baf9524..f8a1198f2 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double.h @@ -37,6 +37,14 @@ namespace QDP { typedef PSpinVector, 3>, 4> DVec; typedef PScalar > > DScal; +} // namespace QDP; + +// the wrappers for the functions to be threaded + +#include "qdp_dispatch.h" +#include "qdp_scalarsite_sse_blas_double_wrapper.h" + +namespace QDP { //////////////////////////////// // Threading evaluates @@ -44,11 +52,6 @@ typedef PScalar > > DScal; // by Xu Guo, EPCC, 6 October, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded - -#include "qdp_dispatch.h" -#include "qdp_scalarsite_sse_blas_double_wrapper.h" - // #define DEBUG_BLAS // TVec is the LatticeFermion from qdp_dwdefs.h with the OLattice<> stripped diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h index 658c32396..713784a6d 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_double_wrapper.h @@ -1,7 +1,7 @@ #ifndef QDP_SCALARSITE_GENERIC_BLAS_DOUBLE_WRAPPER_H #define QDP_SCALARSITE_GENERIC_BLAS_DOUBLE_WRAPPER_H - +namespace QDP { //////////////////////////////// // Threading evaluates wrappers @@ -33,7 +33,7 @@ void ordered_sse_vaxOpy4_double_evaluate_function (int lo, int hi, int myId, ord int n_4vec = hi - lo; int index = lo * 24; - + InScale = &InScale[index]; Out = &Out[index]; @@ -221,7 +221,7 @@ void unordered_sse_vscal4_double_evaluate_function (int lo, int hi, int myId, un int i=tab[j]; REAL64 *xptr = (REAL64 *) &(x.elem(i).elem(0).elem(0).real()); REAL64 *zptr 
= &(d.elem(i).elem(0).elem(0).real()); - + vscal4(zptr, aptr, xptr, Ns); } @@ -288,7 +288,7 @@ void ordered_sse_vaxOpbyz4_double_evaluate_function (int lo, int hi, int myId, o REAL64* bptr = a->bptr; REAL64* yptr = a->yptr; void (*func)(REAL64*, REAL64*, REAL64*, REAL64*, REAL64*, int) = a->func; - + int n_4vec = hi - lo; int index = lo * 24; @@ -339,16 +339,16 @@ void unordered_sse_vaxOpbyz4_double_evaluate_function (int lo, int hi, int myId, for(int j=lo; j < hi; j++) { int i=tab[j]; - + REAL64 *xptr = (REAL64 *) &(x.elem(i).elem(0).elem(0).real()); REAL64 *yptr = (REAL64 *) &(y.elem(i).elem(0).elem(0).real()); REAL64* zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, bptr, yptr, Ns); } - + } struct ordered_norm_double_user_arg { @@ -390,6 +390,7 @@ inline void ordered_inner_product_double_func(int lo, int hi, int myId, ordered_ func( &(a->results[2*myId]), xptr, yptr, nvec); } +} // namespace QDP; #endif diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h index c241798fd..7ba98fae9 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_g5.h @@ -12,8 +12,6 @@ #include "scalarsite_sse/qdp_scalarsite_sse_blas_g5_includes.h" -using namespace QDP; - namespace QDP { // Types needed for the expression templates. 
diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h b/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h index 6914e1599..9961a5127 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_blas_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SCALARSITE_SSE_BLAS_WRAPPER_H #define QDP_SCALARSITE_SSE_BLAS_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -86,7 +88,7 @@ void unordered_sse_vaxOpy3_y_evaluate_function (int lo, int hi, int myId, unorde func(yptr, scalep, yptr, xptr, 1); } } - + } @@ -230,7 +232,7 @@ void unordered_vOp_z_evaluate_function (int lo, int hi, int myId, unordered_sse_ REAL32 *xptr = (REAL32 *) &(x.elem(i).elem(0).elem(0).real()); REAL32 *yptr = (REAL32 *) &(y.elem(i).elem(0).elem(0).real()); REAL32* zptr = &(d.elem(i).elem(0).elem(0).real()); - + // Get the no of 3vecs. s.start() and s.end() are inclusive so add +1 func(zptr, xptr, yptr, 1); @@ -374,9 +376,9 @@ void unordered_sse_vaxOpby3_evaluate_function (int lo, int hi, int myId, unorder REAL32 *xptr = (REAL32 *) &(x.elem(i).elem(0).elem(0).real()); REAL32 *yptr = (REAL32 *) &(y.elem(i).elem(0).elem(0).real()); REAL32 * zptr = &(d.elem(i).elem(0).elem(0).real()); - + func(zptr, aptr, xptr, bptr, yptr, 1); - + } } @@ -398,4 +400,6 @@ inline void ordered_norm_single_func(int lo, int hi, int myId, ordered_sse_norm_ func( &(a->results[myId]), vptr, nvec); } +} // namespace QDP; + #endif diff --git a/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h b/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h index 931700600..9271ead52 100644 --- a/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h +++ b/include/scalarsite_sse/qdp_scalarsite_sse_linalg.h @@ -13,20 +13,10 @@ // These SSE asm instructions are only supported under GCC/G++ #if defined(__GNUC__) -namespace QDP { - - // #define QDP_SCALARSITE_DEBUG - #define QDP_SCALARSITE_USE_EVALUATE - -/*! 
@defgroup optimizations Optimizations - * - * Optimizations for basic QDP operations - * - * @{ - */ +namespace QDP { // Use this def just to safe some typing later on in the file typedef RComplex RComplexFloat; @@ -36,7 +26,7 @@ typedef PScalar > TCol; typedef PSpinVector, 3>, 2> TVec2; typedef PSpinVector, 3>, 4> TVec4; - +} // namespace QDP #include "sse_mult_su3_nn.h" #include "sse_mult_su3_an.h" @@ -48,6 +38,14 @@ typedef PSpinVector, 3>, 4> TVec4; #include "sse_mult_su3_mat_hwvec.h" #include "sse_mult_adj_su3_mat_hwvec.h" +namespace QDP { + +/*! @defgroup optimizations Optimizations + * + * Optimizations for basic QDP operations + * + * @{ + */ // Optimized version of // PColorMatrix <- PColorMatrix * PColorMatrix diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h index b78441319..0b3a23671 100644 --- a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h +++ b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates.h @@ -3,11 +3,15 @@ #include "sse_mult_adj_su3_mat_hwvec.h" -/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */ -using namespace QDP; namespace QDP { - typedef PScalar< PColorMatrix< RComplex, 3> > SU3Mat32; +} // namespace QDP + +// the wrappers for the functions to be threaded +#include "qdp_sse_fused_spin_proj_evaluates_wrapper.h" + +/* Evaluates for things like adj(u)*spinProjectDir0Plus(y) */ +namespace QDP { //////////////////////////////// @@ -16,9 +20,6 @@ typedef PScalar< PColorMatrix< RComplex, 3> > SU3Mat32; // by Xu Guo, EPCC, 20 October, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_sse_fused_spin_proj_evaluates_wrapper.h" - // HalfVec = adj(u)*SpinProjectDir0Plus(Vec); template<> diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h index 686f742e4..a4777ce32 100644 --- 
a/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h +++ b/include/scalarsite_sse/qdp_sse_fused_spin_proj_evaluates_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SSE_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H #define QDP_SSE_FUSED_SPIN_PROJ_EVALUATES_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -35,11 +37,11 @@ void ordered_sse_fused_spin_proj_evaluate_function (int lo, int hi, int myId, or func( (REAL32 *)&(a.elem(site).elem(0).elem(0).real()), (REAL32 *)&(tmp.elem(0).elem(0).real()), 1); - + su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real()); half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real()); half_wilson_vectorf *dh = (half_wilson_vectorf *)&( d.elem(site).elem(0).elem(0).real()); - + intrin_sse_mult_adj_su3_mat_hwvec(um, tmph, dh); } @@ -72,16 +74,16 @@ void unordered_sse_fused_spin_proj_evaluate_function (int lo, int hi, int myId, func( (REAL32 *)&(a.elem(site).elem(0).elem(0).real()), (REAL32 *)&(tmp.elem(0).elem(0).real()), 1); - + su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real()); half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real()); half_wilson_vectorf *dh = (half_wilson_vectorf *)&( d.elem(site).elem(0).elem(0).real()); - + intrin_sse_mult_adj_su3_mat_hwvec(um, tmph, dh); } } - +} // namespace QDP #endif diff --git a/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h b/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h index ce461bcac..1eef8e6d7 100644 --- a/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h +++ b/include/scalarsite_sse/qdp_sse_fused_spin_recon_evaluates_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SSE_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H #define QDP_SSE_FUSED_SPIN_RECON_EVALUATES_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -39,8 +41,8 @@ void 
ordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId, o half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real()); intrin_sse_mult_su3_mat_hwvec(um, ah, tmph); - - + + func( (REAL32 *)&(tmp.elem(0).elem(0).real()), (REAL32 *)&(d.elem(site).elem(0).elem(0).real()), 1); @@ -73,7 +75,7 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId, for (int j = lo; j < hi; j++){ int site=tab[j]; - + HVec32 tmp ; su3_matrixf* um = (su3_matrixf *)&(u.elem(site).elem().elem(0,0).real()); @@ -81,8 +83,8 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId, half_wilson_vectorf *tmph = (half_wilson_vectorf *)&( tmp.elem(0).elem(0).real()); intrin_sse_mult_su3_mat_hwvec(um, ah, tmph); - - + + func( (REAL32 *)&(tmp.elem(0).elem(0).real()), (REAL32 *)&(d.elem(site).elem(0).elem(0).real()), 1); @@ -90,5 +92,7 @@ void unordered_sse_fused_spin_recon_evaluate_function (int lo, int hi, int myId, } +} // namespace QDP + #endif diff --git a/include/scalarsite_sse/qdp_sse_spin_evaluates.h b/include/scalarsite_sse/qdp_sse_spin_evaluates.h index 80fe29915..f21a5877c 100644 --- a/include/scalarsite_sse/qdp_sse_spin_evaluates.h +++ b/include/scalarsite_sse/qdp_sse_spin_evaluates.h @@ -1,7 +1,6 @@ #ifndef QDP_SSE_SPIN_EVALUATES_H #define QDP_SSE_SPIN_EVALUATES_H -using namespace QDP; namespace QDP { // Typedefs @@ -15,16 +14,19 @@ typedef REAL32 SpinColFull[4][3][2]; typedef REAL32 SpinColHalf[2][3][2]; // d = SpinProjectDir0Plus(Vec); +} // namespace QDP; + +// the wrappers for the functions to be threaded +#include "qdp_sse_spin_evaluates_wrapper.h" + +namespace QDP { + //////////////////////////////// // Threading evaluates // // by Xu Guo, EPCC, 13 October, 2008 //////////////////////////////// -// the wrappers for the functions to be threaded -#include "qdp_sse_spin_evaluates_wrapper.h" - - template inline diff --git a/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h 
b/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h index 59a49a2b7..88e209b7a 100644 --- a/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h +++ b/include/scalarsite_sse/qdp_sse_spin_evaluates_wrapper.h @@ -1,6 +1,8 @@ #ifndef QDP_SSE_SPIN_EVALUATES_WRAPPER_H #define QDP_SSE_SPIN_EVALUATES_WRAPPER_H +namespace QDP { + //////////////////////////////// // Threading evaluates wrappers // @@ -65,7 +67,6 @@ void unordered_sse_spin_project_evaluate_function (int lo, int hi, int myId, uno } - - +} // namespace QDP #endif diff --git a/include/scalarsite_sse/sse_fused_spin_proj.h b/include/scalarsite_sse/sse_fused_spin_proj.h index e7138cacb..12cb1c740 100644 --- a/include/scalarsite_sse/sse_fused_spin_proj.h +++ b/include/scalarsite_sse/sse_fused_spin_proj.h @@ -5,7 +5,6 @@ #include "sse_mult_adj_su3_mat_hwvec.h" #include -using namespace std; namespace QDP { From 0788979355d1a31558ca1bb0fc7bf035df12281b Mon Sep 17 00:00:00 2001 From: Jonas Glesaaen Date: Mon, 14 May 2018 14:09:53 +0100 Subject: [PATCH 5/5] Moved includes outside of namespaces in scalarsite_bagel and scalarvecsite --- .../scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h | 6 ++---- include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h | 6 +++--- include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h | 6 +++--- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h b/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h index c123896be..feedc124f 100644 --- a/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h +++ b/include/scalarsite_bagel_qdp/qdp_scalarsite_bagel_qdp_linalg.h @@ -10,6 +10,8 @@ #ifndef QDP_SCALARSITE_BAGEL_QDP_LINALG_H #define QDP_SCALARSITE_BAGEL_QDP_LINALG_H +#include "bagel_qdp.h" + namespace QDP { /*! 
@defgroup optimizations Optimizations @@ -21,10 +23,6 @@ namespace QDP { // Use this def just to safe some typing later on in the file - - -#include "bagel_qdp.h" - #if 1 typedef RComplex RComplexFloat; diff --git a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h index 3eac91d6a..693565ee8 100755 --- a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h +++ b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_blas.h @@ -9,13 +9,13 @@ #ifndef QDP_SCALARVECSITE_SSE_BLAS_H #define QDP_SCALARVECSITE_SSE_BLAS_H +#define QDP_SCALARVECSITE_USE_EVALUATE +#include "qdp_sse_intrin.h" + namespace QDP { // #define QDP_SCALARVECSITE_BLAS_DEBUG -#define QDP_SCALARVECSITE_USE_EVALUATE -#include "qdp_sse_intrin.h" - typedef PScalar >, 3> > TCMat; typedef PScalar >, 3> > TCVec; typedef PSpinVector >, 3>, 4> TDirac; diff --git a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h index b71b961ac..ce7ea0eab 100755 --- a/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h +++ b/include/scalarvecsite_sse/qdp_scalarvecsite_sse_linalg.h @@ -15,11 +15,13 @@ #include "qdp_sse_intrin.h" +#define QDP_SCALARVECSITE_USE_EVALUATE +#include "scalarvecsite_sse/ssevec_mult_nn.h" + namespace QDP { // #define QDP_SCALARVECSITE_DEBUG -#define QDP_SCALARVECSITE_USE_EVALUATE @@ -35,8 +37,6 @@ typedef IScalar IScalarFloat; typedef ILattice ILatticeFloat; typedef RComplex > RComplexFloat; -#include "scalarvecsite_sse/ssevec_mult_nn.h" - //-------------------------------------------------------------------------------------- // Optimized version of // ILatticeFloat <- ILatticeFloat + ILatticeFloat