Skip to content

Commit

Permalink
Merge pull request #434 from lattice/hotfix/0.8_update
Browse files Browse the repository at this point in the history
added date to NEWS,README file and fixed some typos
  • Loading branch information
mathiaswagner committed Feb 1, 2016
2 parents d38ed85 + d744cc1 commit 9425ca6
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 38 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

Copyright (c) 2009-2015 QUDA Developers
Copyright (c) 2009-2016 QUDA Developers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
11 changes: 6 additions & 5 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
Version 0.8.0 - xxth December 2015
Version 0.8.0 - 1st February 2016

- Removed all Tesla-generation GPU support from QUDA (sm_1x). As a
result, QUDA now requires a minimum of the Fermi-generation GPUs.

- Added support for building QUDA using cmake. This gives a much more
flexible and extensible build system as well as allowing
out-of-source-directory building.
out-of-source-directory building. For details see:
https://github.com/lattice/quda/wiki/Building-QUDA-with-cmake

- Improved strong scaling of the multi-shift solver by overlapping the
shift updates with the subsequent iteration's dslash comms waiting.

- Improved performance of multi-shift solver by preventing unecessary
- Improved performance of multi-shift solver by preventing unnecessary
refinement of shifted solutions once the residual falls below
floating point precision.

Expand Down Expand Up @@ -45,9 +46,9 @@ Version 0.8.0 - xxth December 2015
force kernels. This also improves compilation time and reduces
library size.

- Added support for imaginary chemical potential to the staggeed phase
- Added support for imaginary chemical potential to the staggered phase
application / removal kernel, as well as fixing bugs in this
reoutine.
routine.

- Algorithms that previously used double-precision atomics now use a
cub reduction. This drastically improves performance of such
Expand Down
2 changes: 1 addition & 1 deletion README
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Release Notes for QUDA v0.8.0 xxth December 2015
Release Notes for QUDA v0.8.0 1st February 2016
-----------------------------

Overview:
Expand Down
14 changes: 8 additions & 6 deletions include/quda.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,14 @@ extern "C" {

int overlap; /**< Width of overlapping domains */

int use_resident_gauge; /**< Use the resident gauge field */
int use_resident_mom; /**< Use the resident mom field */
int make_resident_gauge; /**< Make the gauge field resident */
int make_resident_mom; /**< Make the mom field resident */
int return_gauge; /**< Return the new gauge field */
int return_mom; /**< Return the new mom field */
int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */

int use_resident_gauge; /**< Use the resident gauge field as input */
int use_resident_mom; /**< Use the resident momentum field as input */
int make_resident_gauge; /**< Make the result gauge field resident */
int make_resident_mom; /**< Make the result momentum field resident */
int return_result_gauge; /**< Return the result gauge field */
int return_result_mom; /**< Return the result momentum field */

} QudaGaugeParam;

Expand Down
10 changes: 6 additions & 4 deletions lib/check_params.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,21 @@ void printQudaGaugeParam(QudaGaugeParam *param) {
#endif

#if defined INIT_PARAM
P(overwrite_mom, 0);
P(use_resident_gauge, 0);
P(use_resident_mom, 0);
P(make_resident_gauge, 0);
P(make_resident_mom, 0);
P(return_gauge, 1);
P(return_mom, 1);
P(return_result_gauge, 1);
P(return_result_mom, 1);
#else
P(overwrite_mom, INVALID_INT);
P(use_resident_gauge, INVALID_INT);
P(use_resident_mom, INVALID_INT);
P(make_resident_gauge, INVALID_INT);
P(make_resident_mom, INVALID_INT);
P(return_gauge, INVALID_INT);
P(return_mom, INVALID_INT);
P(return_result_gauge, INVALID_INT);
P(return_result_mom, INVALID_INT);
#endif

#ifdef INIT_PARAM
Expand Down
34 changes: 20 additions & 14 deletions lib/interface_quda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ extern char* gitversion;
* Set the device that QUDA uses.
*/
void initQudaDevice(int dev) {

//static bool initialized = false;
if (initialized) return;
initialized = true;
Expand Down Expand Up @@ -434,13 +433,17 @@ void initQudaDevice(int dev) {
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
//cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
cudaGetDeviceProperties(&deviceProp, dev);

profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}

/*
* Any persistent memory allocations that QUDA uses are done here.
*/
void initQudaMemory()
{
profileInit.TPSTART(QUDA_PROFILE_TOTAL);

if (!comms_initialized) init_default_comms();

streams = new cudaStream_t[Nstream];
Expand Down Expand Up @@ -470,6 +473,8 @@ void initQudaMemory()
cudaHostGetDevicePointer(&num_failures_d, num_failures_h, 0);

loadTuneCache(getVerbosity());

profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}

void initQuda(int dev)
Expand All @@ -489,8 +494,6 @@ void initQuda(int dev)
pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutex_init(&pthread_mutex, &mutex_attr);
#endif

profileInit.TPSTOP(QUDA_PROFILE_TOTAL);
}


Expand Down Expand Up @@ -3391,15 +3394,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
if (qudaGaugeParam->use_resident_mom) {
if (!gaugePrecise) errorQuda("No resident momentum field to use");
cudaMom = momResident;
if (qudaGaugeParam->overwrite_mom) cudaMom->zero();
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
} else {
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE;
gParamMom.order = QUDA_FLOAT2_GAUGE_ORDER;
gParamMom.reconstruct = QUDA_RECONSTRUCT_10;
gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS;
gParamMom.precision = qudaGaugeParam->cuda_prec;
gParamMom.create = QUDA_ZERO_FIELD_CREATE;
cudaMom = new cudaGaugeField(gParamMom);
if (!qudaGaugeParam->overwrite_mom) cudaMom->loadCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT);
}

Expand All @@ -3409,7 +3414,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int
path_length, loop_coeff, num_paths, max_length);
profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE);

if (qudaGaugeParam->return_mom) {
if (qudaGaugeParam->return_result_mom) {
profileGaugeForce.TPSTART(QUDA_PROFILE_D2H);
cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H);
Expand Down Expand Up @@ -4318,10 +4323,10 @@ computeHISQForceQuda(void* const milc_momentum,
updateMomentum(*momResident, 1.0, *cudaMom);
}

if (gParam->return_mom) {
if (gParam->return_result_mom) {
profileHISQForce.TPSTART(QUDA_PROFILE_D2H);
// Close the paths, make anti-hermitian, and store in compressed format
if (gParam->return_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
profileHISQForce.TPSTOP(QUDA_PROFILE_D2H);
}

Expand Down Expand Up @@ -4365,7 +4370,7 @@ void computeStaggeredOprodQuda(void** oprod,

#ifdef GPU_STAGGERED_OPROD
#ifndef BUILD_QDP_INTERFACE
#error "Staggerd oprod requires BUILD_QDP_INTERFACE";
#error "Staggered oprod requires BUILD_QDP_INTERFACE";
#endif
using namespace quda;
profileStaggeredOprod.TPSTART(QUDA_PROFILE_TOTAL);
Expand Down Expand Up @@ -4825,7 +4830,8 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.gauge = gauge;
gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO;
cpuGaugeField *cpuGauge = !param->use_resident_gauge ? new cpuGaugeField(gParam) : NULL;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;

gParam.reconstruct = gParam.order == QUDA_TIFR_GAUGE_ORDER ?
QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10;
Expand Down Expand Up @@ -4875,7 +4881,7 @@ void updateGaugeFieldQuda(void* gauge,
(bool)conj_mom, (bool)exact);
profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE);

if (param->return_gauge) {
if (param->return_result_gauge) {
// copy the gauge field back to the host
profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H);
cudaOutGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
Expand Down Expand Up @@ -4923,7 +4929,7 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.link_type = QUDA_GENERAL_LINKS;
gParam.gauge = gauge_h;
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;

// create the device fields
Expand Down Expand Up @@ -4954,7 +4960,7 @@ void updateGaugeFieldQuda(void* gauge,
errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h);

profileProject.TPSTART(QUDA_PROFILE_D2H);
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
profileProject.TPSTOP(QUDA_PROFILE_D2H);

if (param->make_resident_gauge) {
Expand Down Expand Up @@ -4985,7 +4991,7 @@ void updateGaugeFieldQuda(void* gauge,
gParam.reconstruct = QUDA_RECONSTRUCT_NO;
gParam.link_type = QUDA_GENERAL_LINKS;
gParam.gauge = gauge_h;
bool need_cpu = !param->use_resident_gauge || param->return_gauge;
bool need_cpu = !param->use_resident_gauge || param->return_result_gauge;
cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : NULL;

// create the device fields
Expand Down Expand Up @@ -5014,7 +5020,7 @@ void updateGaugeFieldQuda(void* gauge,
profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE);

profilePhase.TPSTART(QUDA_PROFILE_D2H);
if (param->return_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge, QUDA_CPU_FIELD_LOCATION);
profilePhase.TPSTOP(QUDA_PROFILE_D2H);

if (param->make_resident_gauge) {
Expand Down
17 changes: 13 additions & 4 deletions lib/milc_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,11 @@ void qudaHisqForce(int prec, const double level2_coeff[6], const double fat7_coe
if (!invalidate_quda_mom) {
gParam.use_resident_mom = true;
gParam.make_resident_mom = true;
gParam.return_mom = false;
gParam.return_result_mom = false;
} else {
gParam.use_resident_mom = false;
gParam.make_resident_mom = false;
gParam.return_mom = true;
gParam.return_result_mom = true;
}

long long flops;
Expand Down Expand Up @@ -546,11 +546,20 @@ void qudaGaugeForce( int precision,
if (!invalidate_quda_mom) {
qudaGaugeParam.use_resident_mom = true;
qudaGaugeParam.make_resident_mom = true;
qudaGaugeParam.return_mom = false;
qudaGaugeParam.return_result_mom = false;

// this means when we compute the momentum, we accumulate to the
// preexisting resident momentum instead of overwriting it
qudaGaugeParam.overwrite_mom = false;
} else {
qudaGaugeParam.use_resident_mom = false;
qudaGaugeParam.make_resident_mom = false;
qudaGaugeParam.return_mom = true;
qudaGaugeParam.return_result_mom = true;

// this means we compute momentum into a fresh field, copy it back
// and sum to current momentum in MILC. This saves an initial
// CPU->GPU download of the current momentum.
qudaGaugeParam.overwrite_mom = true;
}

int max_length = 6;
Expand Down
30 changes: 27 additions & 3 deletions lib/quda_fortran.F90
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,21 @@ module quda_fortran
! Whether the staggered phase has already been applied to the links
integer(4) :: staggered_phase_applied

! Imaginary chemical potential
real(8) :: i_mu

integer(4) :: overlap ! width of domain overlap

! When computing momentum, should we overwrite it or accumulate
! to it (only presently supported in gauge-force)
integer(4) :: overwrite_mom

integer(4) :: use_resident_gauge ! Use the resident gauge field
integer(4) :: use_resident_mom ! Use the resident mom field
integer(4) :: make_resident_gauge ! Make the gauge field resident
integer(4) :: make_resident_mom ! Make the mom field resident
integer(4) :: use_resident_mom ! Use the resident momentum field
integer(4) :: make_resident_gauge ! Make the result gauge field resident
integer(4) :: make_resident_mom ! Make the result momentum field resident
integer(4) :: return_result_gauge ! Return the result gauge field
integer(4) :: return_result_mom ! Return the result momentum field

end type quda_gauge_param

Expand Down Expand Up @@ -114,6 +123,9 @@ module quda_fortran
! Actual L2 residual norm achieved in solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_offset

! Iterated L2 residual achieved in multi shift solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: iter_res_offset

! Actual heavy quark residual norm achieved in solver for each offset
real(8), dimension(QUDA_MAX_MULTI_SHIFT) :: true_res_hq_offset

Expand Down Expand Up @@ -206,6 +218,18 @@ module quda_fortran
integer(4)::max_search_dim ! for magma library this parameter must be multiple 16?
integer(4)::rhs_idx
integer(4)::deflation_grid !total deflation space is nev*deflation_grid
integer(4)::use_reduced_vector_set ! eigCG: specifies whether to use reduced eigenvector set
real(8):: eigenval_tol ! eigCG: selection criterion for the reduced eigenvector set
integer(4)::use_cg_updates ! mixed precision eigCG:whether to use cg refinement corrections in the incremental stage
real(8)::cg_iterref_tol ! mixed precision eigCG: tolerance for cg refinement corrections in the incremental stage
integer(4)::eigcg_max_restarts ! mixed precision eigCG tuning parameter: minimum search vector space restarts
integer(4)::max_restart_num ! initCG tuning parameter: maximum restarts
real(8)::inc_tol ! initCG tuning parameter: decrease in absolute value of the residual within each restart cycle

! Parameters for setting data residency of the solver
integer(8)::make_resident_solution ! Whether to make the solution vector(s) resident after the solve
integer(8)::use_resident_solution ! Whether to use the resident solution vector(s)

end type quda_invert_param

end module quda_fortran
Expand Down

0 comments on commit 9425ca6

Please sign in to comment.