From 3ece1425a749b1ff0a1e7e2fec3731656a035d30 Mon Sep 17 00:00:00 2001 From: eric bylaska Date: Mon, 20 May 2024 13:58:28 -0700 Subject: [PATCH] memory leak bug fix...EJB --- Nwpw/nwpwlib/C3dB/CBalance.cpp | 23 +- Nwpw/nwpwlib/C3dB/CGrid.cpp | 796 +++++++++++---------- Nwpw/nwpwlib/C3dB/c3db.cpp | 62 +- Nwpw/nwpwlib/D3dB/d3db.cpp | 180 ++--- Nwpw/nwpwlib/device/gdevice2.cpp | 10 +- Nwpw/nwpwlib/lattice/Balance.cpp | 50 +- Nwpw/nwpwlib/lattice/PGrid.cpp | 1127 +++++++++++++++++++----------- Nwpw/nwpwlib/utilities/util.hpp | 6 + 8 files changed, 1348 insertions(+), 906 deletions(-) diff --git a/Nwpw/nwpwlib/C3dB/CBalance.cpp b/Nwpw/nwpwlib/C3dB/CBalance.cpp index 345e69df..1b0a29e3 100644 --- a/Nwpw/nwpwlib/C3dB/CBalance.cpp +++ b/Nwpw/nwpwlib/C3dB/CBalance.cpp @@ -295,7 +295,7 @@ CBalance::~CBalance() /*********************************** * * - * CBalance::c_unbalance_start * + * CBalance::c_unbalance_start * * * ***********************************/ /** @@ -311,24 +311,23 @@ CBalance::~CBalance() */ void CBalance::c_unbalance_start(const int nffts, const int nb, double *a, const int request_indx, const int msgtype) { - int j, pto, pfrom, msglen, indx; - + //int j, pto, pfrom, msglen, indx; if (sender_list[nb]) - for (j = 0; j < npacket_list[nb]; ++j) + for (auto j=0; j 0) parall->adreceive(request_indx, msgtype, pfrom, msglen, a + indx); } if (receiver_list[nb]) - for (j = 0; j < npacket_list[nb]; ++j) + for (auto j=0; j 0) parall->adsend(request_indx, msgtype, pto, msglen, a + indx); } @@ -357,7 +356,7 @@ void CBalance::c_unbalance_end(const int nffts, const int nb, double *a, const i /******************************** * * - * CBalance::c_unbalance * + * CBalance::c_unbalance * * * ********************************/ /** diff --git a/Nwpw/nwpwlib/C3dB/CGrid.cpp b/Nwpw/nwpwlib/C3dB/CGrid.cpp index 3c9362fb..3f8b8779 100644 --- a/Nwpw/nwpwlib/C3dB/CGrid.cpp +++ b/Nwpw/nwpwlib/C3dB/CGrid.cpp @@ -1383,7 +1383,8 @@ void CGrid::c_unpack_start(const int nffts, const int nb, double *tmp1, double * const int request_indx, const int msgtype) { if (balanced) - mybalance->c_unbalance_start(nffts, nb, tmp1, request_indx, msgtype); + for (auto s=0; sc_unbalance_start(nffts, nb, tmp1+s*n2ft3d, request_indx, msgtype); } /******************************** @@ -1397,10 +1398,11 @@ void CGrid::c_unpack_mid(const int nffts, const int nb, double *tmp1, double *tm if (balanced) mybalance->c_unbalance_end(nffts, nb, tmp1, request_indx); - std::memcpy(tmp2, tmp1, 2 * (nidb2[nb]) * sizeof(double)); - std::memset(tmp1, 0, 2*nfft3d * sizeof(double)); + std::memcpy(tmp2, tmp1, nffts*2*(nidb2[nb])*sizeof(double)); + std::memset(tmp1, 0, nffts*2*nfft3d*sizeof(double)); - c_bindexcopy((nidb2[nb]), packarray[nb], tmp2, tmp1); + for (auto s=0; sastart(request_indx,parall->np_i()); // unpack start, tmp1-->tmp1 - std::memcpy(tmp1, a, 2 * (nidb[nb]) * sizeof(double)); + for (auto s=0; sc_unpack_start(nffts, nb, tmp1, tmp2, request_indx, 47); } else if (step == 1) @@ -1715,36 +1733,39 @@ void CGrid::pfftbz_start(const int nffts, const int nb, double *tmp1, double *tm *** A(kx,nz,ky) <- fft1d^(-1)[A(kx,kz,ky)] *** ***************************************************/ int indx0 = 0; - int indx2 = 0; int nn = 0; - for (auto q=0; qastart(request_indx,parall->np_i()); // unpack start, tmp1-->tmp1 - std::memcpy(tmp1, a, 2 * (nidb[nb]) * sizeof(double)); + for (auto s=0; sc_unpack_start(nffts, nb, tmp1, tmp2, request_indx, 47); } else if (step == 1) @@ -2374,8 +2419,8 @@ void CGrid::pfftfx(const int nffts, const int nb, double *a, double *tmp1, doubl { // do fft along nx dimension //c3db::mygdevice.batch_cfftx_tmpx(c3db::fft_tag,true, nx, ny*nq, 2*nfft3d, a, c3db::tmpx); - c3db::mygdevice.batch_cfft(c3db::fft_tag,true,nx,ny*nq,nx,a,c3db::forward_x,c3db::tmpx,0); - std::memcpy(tmp1, a, 2*nfft3d * sizeof(double)); + c3db::mygdevice.batch_cfft(c3db::fft_tag,true,nx,nffts*ny*nq,nx,a,c3db::forward_x,c3db::tmpx,0); + std::memcpy(tmp1, a, nffts*2*nfft3d * sizeof(double)); } /**** hilbert mapping ****/ else @@ -2383,8 +2428,8 @@ void CGrid::pfftfx(const int nffts, const int nb, double *a, double *tmp1, doubl // do fft along nx dimension // A(kx,ny,nz) <- fft1d[A(nx,ny,nz)] //c3db::mygdevice.batch_cfftx_tmpx(c3db::fft_tag,true, nx, nq1, 2*nfft3d, a, c3db::tmpx); - c3db::mygdevice.batch_cfft(c3db::fft_tag,true,nx,nq1,nx,a,c3db::forward_x,c3db::tmpx,0); - c3db::c_ptranspose_ijk_start(nffts, nb, 0, a, tmp1, tmp2, request_indx, 40); + c3db::mygdevice.batch_cfft(c3db::fft_tag,true,nx,nffts*nq1,nx,a,c3db::forward_x,c3db::tmpx,0); + c3db::c_ptranspose_ijk_start(nffts,nb,0,a,tmp1,tmp2,request_indx, 40); } } @@ -2403,58 +2448,64 @@ void CGrid::pfftfy(const int nffts, const int nb, double *tmp1, double *tmp2, in // do fft along ny dimension // A(kx,ky,nz) <- fft1d[A(kx,ny,nz)] int indx0 = 0; - int indx2 = 0; int nn = 0; - for (auto q=0; qc_balance_start(nffts, nb, a, request_indx, msgtype); + mybalance->c_balance_start(nffts, nb, a, request_indx, msgtype); return; } diff --git a/Nwpw/nwpwlib/C3dB/c3db.cpp b/Nwpw/nwpwlib/C3dB/c3db.cpp index a29bd4da..9c93dcf2 100644 --- a/Nwpw/nwpwlib/C3dB/c3db.cpp +++ b/Nwpw/nwpwlib/C3dB/c3db.cpp @@ -4181,10 +4181,11 @@ void c3db::c_ptranspose1_jk_start(const int nffts, const int nb, double *a, doub int msglen; int n1 = p_i1_start[nb][0][np]; - c_aindexcopy(n1, p_iq_to_i1[nb][0], a, tmp1); + for (auto s=0; s 0) parall->adreceive(request_indx, msgtype, proc_from, msglen, tmp2 + 2*p_i2_start[nb][0][it]); // parall->adreceive(request_indx,msgtype,proc_from,msglen,&tmp2[2*p_i2_start[nb][0][it]]); @@ -4200,7 +4201,7 @@ void c3db::c_ptranspose1_jk_start(const int nffts, const int nb, double *a, doub for (auto it=1; it 0) parall->adsend(request_indx, msgtype, proc_to, msglen, tmp1 + 2*p_i1_start[nb][0][it]); // parall->adsend(request_indx,msgtype,proc_to,msglen,&tmp1[2*p_i1_start[nb][0][it]]); @@ -4217,8 +4218,11 @@ void c3db::c_ptranspose1_jk_end(const int nffts, const int nb, double *a, double parall->awaitall(request_indx); int n2 = p_i2_start[nb][0][np]; - c_bindexcopy(n2, p_iq_to_i2[nb][0], tmp2, a); - c_bindexzero(nfft3d - n2, p_iz_to_i2[nb][0], a); + for (auto s=0; s 0) parall->adreceive(request_indx, msgtype, proc_from, msglen, tmp2 + 2*p_j2_start[nb][0][it]); } for (auto it = 1; it < np; ++it) { int proc_to = (taskid + it) % np; - msglen = 2 * (p_j1_start[nb][0][it + 1] - p_j1_start[nb][0][it]); + msglen = nffts*2*(p_j1_start[nb][0][it + 1] - p_j1_start[nb][0][it]); if (msglen > 0) parall->adsend(request_indx, msgtype, proc_to, msglen, tmp1 + 2*p_j1_start[nb][0][it]); } @@ -4269,8 +4273,11 @@ void c3db::c_ptranspose2_jk_end(const int nffts, const int nb, double *a, double parall->awaitall(request_indx); int n2 = p_j2_start[nb][0][np]; - c_bindexcopy(n2, p_jq_to_i2[nb][0], tmp2, a); - c_bindexzero(nfft3d - n2, p_jz_to_i2[nb][0], a); + for (auto s=0; stmp2 */ - c_aindexcopy(n1, p_iq_to_i1[nb][op], a, tmp1); + for (auto s=0; stmp1*/ - msglen = 2*(p_i2_start[nb][op][1] - p_i2_start[nb][op][0]); - std::memcpy(tmp2 + 2 * p_i2_start[nb][op][0], - tmp1 + 2 * p_i1_start[nb][op][0], msglen * sizeof(double)); + msglen = nffts*2*(p_i2_start[nb][op][1] - p_i2_start[nb][op][0]); + std::memcpy(tmp2 + 2*p_i2_start[nb][op][0], + tmp1 + 2*p_i1_start[nb][op][0], msglen*sizeof(double)); /* receive packed array data */ for (auto it=1; it 0) - parall->adreceive(request_indx, msgtype, proc_from, msglen, - tmp2 + 2 * p_i2_start[nb][op][it]); + parall->adreceive(request_indx, msgtype, proc_from, msglen, + tmp2 + 2*p_i2_start[nb][op][it]); // parall->adreceive(request_indx,msgtype,proc_from,msglen,&tmp2[2*p_i2_start[nb][op][it]]); } for (auto it=1; it 0) - parall->adsend(request_indx, msgtype, proc_to, msglen, - tmp1 + 2 * p_i1_start[nb][op][it]); + parall->adsend(request_indx, msgtype, proc_to, msglen, + tmp1 + 2*p_i1_start[nb][op][it]); // parall->adsend(request_indx,msgtype,proc_to,msglen,&tmp1[2*p_i1_start[nb][op][it]]); } } @@ -4332,10 +4340,10 @@ void c3db::c_ptranspose_ijk_end(const int nffts, const int nb, const int op, dou parall->awaitall(request_indx); /* unpack a array */ - for (auto i=0; i 0) - parall->adreceive(request_indx, msgtype, proc_from, msglen, - tmp2 + 2 * p_i2_start[nb][0][it]); + parall->adreceive(request_indx, msgtype, proc_from, msglen, + tmp2 + 2 * p_i2_start[nb][0][it]); // parall->adreceive(request_indx,msgtype,proc_from,msglen,&tmp2[2*p_i2_start[nb][0][it]]); } - for (it = 1; it < np; ++it) + for (it=1; it 0) - parall->adsend(request_indx, msgtype, proc_to, msglen, - tmp1 + 2 * p_i1_start[nb][0][it]); + parall->adsend(request_indx, msgtype, proc_to, msglen, + tmp1 + 2 * p_i1_start[nb][0][it]); // parall->adsend(request_indx,msgtype,proc_to,msglen,&tmp1[2*p_i1_start[nb][0][it]]); } } @@ -3815,8 +3814,11 @@ void d3db::c_ptranspose1_jk_end(const int nffts, const int nb, double *a, double parall->awaitall(request_indx); int n2 = p_i2_start[nb][0][np]; - c_bindexcopy(n2, p_iq_to_i2[nb][0], tmp2, a); - c_bindexzero(nfft3d - n2, p_iz_to_i2[nb][0], a); + for (auto s=0; s 0) - parall->adreceive(request_indx, msgtype, proc_from, msglen, - &tmp2[2 * p_j2_start[nb][0][it]]); - } - for (it = 1; it < np; ++it) { - proc_to = (taskid + it) % np; - msglen = 2 * (p_j1_start[nb][0][it + 1] - p_j1_start[nb][0][it]); - if (msglen > 0) - parall->adsend(request_indx, msgtype, proc_to, msglen, - &tmp1[2 * p_j1_start[nb][0][it]]); - } + const int msgtype) +{ + int it, proc_from, proc_to; + int msglen; + + int n1 = p_j1_start[nb][0][np]; + + for (auto s=0; s 0) + parall->adreceive(request_indx, msgtype, proc_from, msglen, + tmp2+2*p_j2_start[nb][0][it]); + } + for (it=1; it0) + parall->adsend(request_indx, msgtype, proc_to, msglen, + tmp1 + 2*p_j1_start[nb][0][it]); + } } /************************************** @@ -3865,12 +3870,16 @@ void d3db::c_ptranspose2_jk_start(const int nffts, const int nb, double *a, doub * * **************************************/ void d3db::c_ptranspose2_jk_end(const int nffts, const int nb, double *a, double *tmp2, - const int request_indx) { - parall->awaitall(request_indx); + const int request_indx) +{ + parall->awaitall(request_indx); - int n2 = p_j2_start[nb][0][np]; - c_bindexcopy(n2, p_jq_to_i2[nb][0], tmp2, a); - c_bindexzero(nfft3d - n2, p_jz_to_i2[nb][0], a); + int n2 = p_j2_start[nb][0][np]; + for (auto s=0; stmp2 */ - c_aindexcopy(n1, p_iq_to_i1[nb][op], a, tmp1); + for (auto s=0; stmp1*/ - msglen = 2 * (p_i2_start[nb][op][1] - p_i2_start[nb][op][0]); + msglen = nffts*2*(p_i2_start[nb][op][1] - p_i2_start[nb][op][0]); // int one=1; // DCOPY_PWDFT(msglen,&(tmp1[2*p_i1_start[nb][op][0]]),one,&(tmp2[2*p_i2_start[nb][op][0]]),one); // std::memcpy(&(tmp2[2*p_i2_start[nb][op][0]]),&(tmp1[2*p_i1_start[nb][op][0]]),msglen*sizeof(double)); - std::memcpy(tmp2 + 2 * p_i2_start[nb][op][0], - tmp1 + 2 * p_i1_start[nb][op][0], msglen * sizeof(double)); + std::memcpy(tmp2 + 2*p_i2_start[nb][op][0], + tmp1 + 2*p_i1_start[nb][op][0], msglen*sizeof(double)); /* receive packed array data */ - for (it = 1; it < np; ++it) + for (it=1; it 0) - parall->adreceive(request_indx, msgtype, proc_from, msglen, - tmp2 + 2 * p_i2_start[nb][op][it]); + parall->adreceive(request_indx, msgtype, proc_from, msglen, + tmp2 + 2 * p_i2_start[nb][op][it]); // parall->adreceive(request_indx,msgtype,proc_from,msglen,&tmp2[2*p_i2_start[nb][op][it]]); } - for (it = 1; it < np; ++it) + for (it=1; it 0) - parall->adsend(request_indx, msgtype, proc_to, msglen, - tmp1 + 2 * p_i1_start[nb][op][it]); + parall->adsend(request_indx, msgtype, proc_to, msglen, + tmp1 + 2 * p_i1_start[nb][op][it]); // parall->adsend(request_indx,msgtype,proc_to,msglen,&tmp1[2*p_i1_start[nb][op][it]]); } } @@ -3935,8 +3945,11 @@ void d3db::c_ptranspose_ijk_end(const int nffts, const int nb, const int op, dou parall->awaitall(request_indx); /* unpack a array */ - c_bindexcopy(n2, p_iq_to_i2[nb][op], tmp2, a); - c_bindexzero(n3, p_iz_to_i2[nb][op], a); + for (auto s=0; s 0) - parall->adreceive(request_indx, msgtype, proc_from, msglen, - &tmp2[2 * t_i2_start[it]]); + for (it = 1; it < np; ++it) + { + /* synchronous receive of tmp */ + proc_from = (taskid - it + np) % np; + msglen = nffts*2*(t_i2_start[it + 1] - t_i2_start[it]); + if (msglen > 0) + parall->adreceive(request_indx, msgtype, proc_from, msglen, tmp2 + 2*t_i2_start[it]); } - for (it = 1; it < np; ++it) { - proc_to = (taskid + it) % np; - msglen = 2 * (t_i1_start[it + 1] - t_i1_start[it]); - if (msglen > 0) - parall->adsend(request_indx, msgtype, proc_to, msglen, - &tmp1[2 * t_i1_start[it]]); + for (it = 1; it < np; ++it) + { + proc_to = (taskid + it) % np; + msglen = nffts*2*(t_i1_start[it + 1] - t_i1_start[it]); + if (msglen > 0) + parall->adsend(request_indx, msgtype, proc_to, msglen, tmp1 + 2*t_i1_start[it]); } } @@ -4566,7 +4579,8 @@ void d3db::c_timereverse_end(const int nffts, double *a, double *tmp1, double *t int nnfft3d = (t_i2_start[np] - t_i2_start[0] + 0); parall->awaitall(request_indx); - c_bindexcopy_conjg(nnfft3d, t_iq_to_i2 + indx, tmp2 + indx, a); + for (auto s=0; shasgpu) - mygdevice2->batch_cffty_stages(stage,tag,forward, ny, nq, n2ft3d, a,da); + mygdevice2->batch_cffty_stages(stage,tag,forward,ny,nq,n2ft3d,a,da); #endif } @@ -319,9 +319,9 @@ void gdevice2::batch_cfftz_tmpz(const int tag, bool forward, int nz, int nq, int double *a, double *tmpz) { #if defined(NWPW_CUDA) || defined(NWPW_HIP) if (mygdevice2->hasgpu) - mygdevice2->batch_cfftz(tag, forward, nz, nq, n2ft3d, a); + mygdevice2->batch_cfftz(tag,forward,nz,nq,n2ft3d,a); #else - mygdevice2->batch_cfftz_tmpz(forward, nz, nq, n2ft3d, a, tmpz); + mygdevice2->batch_cfftz_tmpz(forward,nz,nq,n2ft3d,a,tmpz); #endif } @@ -329,9 +329,9 @@ void gdevice2::batch_cfftz_tmpz_zero(const int tag, bool forward, int nz, int nq double *a, double *tmpz, bool *zero) { #if defined(NWPW_CUDA) || defined(NWPW_HIP) if (mygdevice2->hasgpu) - mygdevice2->batch_cfftz(tag,forward, nz, nq, n2ft3d, a); + mygdevice2->batch_cfftz(tag,forward,nz,nq,n2ft3d,a); #else - mygdevice2->batch_cfftz_tmpz_zero(forward, nz, nq, n2ft3d, a, tmpz, zero); + mygdevice2->batch_cfftz_tmpz_zero(forward,nz,nq,n2ft3d,a,tmpz,zero); #endif } diff --git a/Nwpw/nwpwlib/lattice/Balance.cpp b/Nwpw/nwpwlib/lattice/Balance.cpp index 815a72dd..2118dc6d 100644 --- a/Nwpw/nwpwlib/lattice/Balance.cpp +++ b/Nwpw/nwpwlib/lattice/Balance.cpp @@ -265,7 +265,7 @@ Balance::Balance(Parallel *inparall, const int maxsize0, const int *nidb, int *n */ Balance::~Balance() { - for (int nb = 0; nb < maxsize; ++nb) + for (int nb=0; nb 0) parall->adreceive(request_indx, msgtype, pfrom, msglen, a + indx); } if (receiver_list[nb]) - for (j = 0; j < npacket_list[nb]; ++j) + for (auto j=0; j 0) parall->adsend(request_indx, msgtype, pto, msglen, a + indx); } @@ -355,22 +353,24 @@ void Balance::c_unbalance(const int nb, double *a) { int j, pto, pfrom, msglen, indx; if (sender_list[nb]) - for (j = 0; j < npacket_list[nb]; ++j) { - pfrom = proc_to_list[nb][j]; - msglen = 2 * packet_size_list[nb][j]; - indx = 2 * indx_start_list[nb][j]; - if (msglen > 0) - parall->dreceive(1, 9, pfrom, msglen, &a[indx]); - } + for (auto j=0; j 0) + parall->dreceive(1, 9, pfrom, msglen, a + indx); + } if (receiver_list[nb]) - for (j = 0; j < npacket_list[nb]; ++j) { - pto = proc_from_list[nb][j]; - msglen = 2 * packet_size_list[nb][j]; - indx = 2 * indx_start_list[nb][j]; - if (msglen > 0) - parall->dsend(1, 9, pto, msglen, &a[indx]); - } + for (auto j=0; j 0) + parall->dsend(1, 9, pto, msglen, a + indx); + } } /******************************** diff --git a/Nwpw/nwpwlib/lattice/PGrid.cpp b/Nwpw/nwpwlib/lattice/PGrid.cpp index d842a613..486c5218 100644 --- a/Nwpw/nwpwlib/lattice/PGrid.cpp +++ b/Nwpw/nwpwlib/lattice/PGrid.cpp @@ -1505,7 +1505,10 @@ void PGrid::c_unpack_start(const int nffts, const int nb, double *tmp1, double * const int request_indx, const int msgtype) { if (balanced) - mybalance->c_unbalance_start(nffts, nb, tmp1, request_indx, msgtype); + { + for (auto s=0; sc_unbalance_start(nffts, nb, tmp1+s*n2ft3d, request_indx, msgtype); + } } /******************************** @@ -1519,10 +1522,11 @@ void PGrid::c_unpack_mid(const int nffts, const int nb, double *tmp1, double *tm if (balanced) mybalance->c_unbalance_end(nffts, nb, tmp1, request_indx); - std::memcpy(tmp2, tmp1, 2 * (nida[nb] + nidb2[nb]) * sizeof(double)); - std::memset(tmp1, 0, n2ft3d * sizeof(double)); + std::memcpy(tmp2, tmp1, nffts*2*(nida[nb]+nidb2[nb])*sizeof(double)); + std::memset(tmp1, 0, nffts*n2ft3d*sizeof(double)); - c_bindexcopy((nida[nb] + nidb2[nb]), packarray[nb], tmp2, tmp1); + for (auto s=0; sastart(request_indx,parall->np_i()); // unpack start, tmp1-->tmp1 - std::memcpy(tmp1, a, 2 * (nida[nb] + nidb[nb]) * sizeof(double)); + for (auto s=0; sc_unpack_start(nffts, nb, tmp1, tmp2, request_indx, 47); } else if (step == 1) @@ -1834,32 +1864,35 @@ void PGrid::pfftbz_start(const int nffts, const int nb, double *tmp1, double *tm *** A(kx,nz,ky) <- fft1d^(-1)[A(kx,kz,ky)] *** ***************************************************/ int indx0 = 0; - int indx2 = 0; int nn = 0; - for (auto q=0; qastart(request_indx,parall->np_i()); // unpack start, tmp1-->tmp1 - std::memcpy(tmp1, a, 2 * (nida[nb] + nidb[nb]) * sizeof(double)); + for (auto s=0; sc_unpack_start(nffts, nb, tmp1, tmp2, request_indx, 47); } else if (step == 1) @@ -2419,6 +2486,11 @@ void PGrid::cr_pfft3b_queuein(const int nb, const int nffts_in, double *a) //int nffts_in = 1; int shift1, shift2; int np = parall->np_i(); + //std::cout << "cr queuein HERA" << std::endl; + //std::cout << "cr_queuein nb=" << nb << " nffts_in=" << nffts_in << std::endl; + //std::cout << " aqsize=" << aqsize << " nffts_max=" << nffts_max << std::endl; + //std::cout << " alast_index=" << alast_index << std::endl; + //std::cout << "src ptr:" << a << std::endl; for (auto q=0; qc_balance_start(nffts, nb, a, request_indx, msgtype); - - return; + const int request_indx, const int msgtype) +{ + // int one=1; + + // DCOPY_PWDFT(n2ft3d,a,one,tmp,one); + std::memcpy(tmp1, a, nffts*n2ft3d * sizeof(double)); + std::memset(a, 0, nffts*n2ft3d * sizeof(double)); + + for (auto s=0; sc_balance_start(nffts, nb, a+s*n2ft3d, request_indx, msgtype); + + return; } /******************************** @@ -3216,12 +3546,13 @@ void PGrid::c_pack_start(const int nffts, const int nb, double *a, double *tmp1, * PGrid:c_pack_end * * * ********************************/ -void PGrid::c_pack_end(const int nffts, const int nb, double *tmp1, const int request_indx) { - - if (balanced) - mybalance->c_balance_end(nffts, nb, tmp1, request_indx); +void PGrid::c_pack_end(const int nffts, const int nb, double *tmp1, const int request_indx) +{ + if (balanced) + for (auto s=0; sc_balance_end(nffts, nb, tmp1+s*n2ft3d, request_indx); - return; + return; } /******************************** diff --git a/Nwpw/nwpwlib/utilities/util.hpp b/Nwpw/nwpwlib/utilities/util.hpp index 1557e082..c09de50c 100644 --- a/Nwpw/nwpwlib/utilities/util.hpp +++ b/Nwpw/nwpwlib/utilities/util.hpp @@ -15,8 +15,14 @@ namespace pwdft { extern void c_aindexcopy(const int, const int *, double *, double *); +extern void c_aindexcopy_stride(const int, const int, const int *, double *, double *); + extern void c_bindexcopy(const int, const int *, double *, double *); +extern void c_bindexcopy_stride(const int, const int, const int *, double *, double *); + extern void c_bindexcopy_conjg(const int, const int *, double *, double *); +extern void c_bindexcopy_conjg_stride(const int, const int, const int *, double *, double *); + extern void c_bindexzero(const int, const int *, double *); extern void t_aindexcopy(const int, const int *, double *, double *);