From 999b65204af1c774dd8b962d5f140d1c2c7313b6 Mon Sep 17 00:00:00 2001 From: Carlchristian Eckert Date: Thu, 9 Jul 2015 11:18:38 +0200 Subject: [PATCH] Refactored to reduce workload - Re-use all vectors as much as possible - one device-vector is created inside the loop (which seems wasteful at first), to enable an easy transition to CUDA streams in the future :) --- include/map_rays_to_prisms.hpp | 7 +++++-- src/calc_phi_ase.cu | 30 +++++++++++++++++++++++++----- src/calc_sample_gain_sum.cu | 5 ++--- src/map_rays_to_prisms.cu | 17 +++++++++++------ 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/include/map_rays_to_prisms.hpp b/include/map_rays_to_prisms.hpp index 79ae814..c44ec40 100644 --- a/include/map_rays_to_prisms.hpp +++ b/include/map_rays_to_prisms.hpp @@ -51,6 +51,9 @@ */ void mapRaysToPrisms( thrust::device_vector &indicesOfPrisms, - const thrust::device_vector::iterator raysPerPrismStart, - const thrust::device_vector::iterator raysPerPrismEnd + const thrust::device_vector::iterator raysPerPrismBegin, + const thrust::device_vector::iterator raysPerPrismEnd, + const thrust::device_vector::iterator prefixSumBegin, + const thrust::device_vector::iterator prefixSumEnd, + const unsigned offset ); diff --git a/src/calc_phi_ase.cu b/src/calc_phi_ase.cu index e61aadf..d0f9f92 100644 --- a/src/calc_phi_ase.cu +++ b/src/calc_phi_ase.cu @@ -162,18 +162,38 @@ float calcPhiAse ( const ExperimentParameters& experiment, blockDim, gridDim); - dGainSum[0] = 0; - dGainSumSquare[0] = 0; + // prepare the Prefix Sum and determine how many rays will be started in each slice + device_vector dPrefixSumComplete(dRaysPerPrism.size()); + thrust::exclusive_scan(dRaysPerPrism.begin(),dRaysPerPrism.end(),dPrefixSumComplete.begin()); + std::vector hRaysPerIterationV(reflectionSlices); + std::vector hRaysPerIterationOffsetV(reflectionSlices); + + for(unsigned i = reflectionSlices, sum = hRaysPerSampleDump; i > 0; ){ + --i; + const unsigned offset = i * mesh.numberOfPrisms; + hRaysPerIterationOffsetV[i] = dPrefixSumComplete[offset]; + hRaysPerIterationV[i] = sum-dPrefixSumComplete[offset]; + sum -= hRaysPerIterationV[i]; + } + //device_vector dIndicesOfPrisms( *(std::max_element(hRaysPerIterationV.begin(), hRaysPerIterationV.end())) ); + + dGainSum[0] = 0; + dGainSumSquare[0] = 0; for(unsigned reflection_i=0; reflection_i < reflectionSlices; ++reflection_i){ + unsigned hRaysPerSampleIteration = hRaysPerIterationV[reflection_i]; + if(hRaysPerSampleIteration == 0) continue; + const unsigned reflectionOffset = mesh.numberOfPrisms * reflection_i; device_vector::iterator reflImportanceBegin = dImportance.begin() + reflectionOffset; device_vector::iterator reflRaysPerPrismBegin = dRaysPerPrism.begin() + reflectionOffset; device_vector::iterator reflRaysPerPrismEnd = reflRaysPerPrismBegin + mesh.numberOfPrisms; + device_vector::iterator reflPrefixSumBegin = dPrefixSumComplete.begin() + reflectionOffset; + device_vector::iterator reflPrefixSumEnd = reflPrefixSumBegin + mesh.numberOfPrisms; + + device_vector dIndicesOfPrisms(hRaysPerSampleIteration); - unsigned hRaysPerSampleIteration = thrust::reduce(reflRaysPerPrismBegin, reflRaysPerPrismEnd, 0u); - device_vector dIndicesOfPrisms(hRaysPerSampleIteration, 0); - mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd); + mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd, reflPrefixSumBegin, reflPrefixSumEnd, hRaysPerIterationOffsetV[reflection_i]); // Start Kernel if(experiment.useReflections){ diff --git a/src/calc_sample_gain_sum.cu b/src/calc_sample_gain_sum.cu index 11309de..7735940 100644 --- a/src/calc_sample_gain_sum.cu +++ b/src/calc_sample_gain_sum.cu @@ -121,7 +121,6 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState, ReflectionPlane reflectionPlane = (reflection_i % 2 == 0) ? BOTTOM_REFLECTION : TOP_REFLECTION; unsigned startLevel = startPrism / mesh.numberOfTriangles; unsigned startTriangle = startPrism - (mesh.numberOfTriangles * startLevel); - unsigned reflectionOffset = reflection_i * mesh.numberOfPrisms; Point startPoint = mesh.genRndPoint(startTriangle, startLevel, globalState); //get a random index in the wavelength array @@ -131,10 +130,10 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState, double gain = propagateRayWithReflection(startPoint, samplePoint, reflections, reflectionPlane, startLevel, startTriangle, mesh, sigmaA[sigma_i], sigmaE[sigma_i]); // include the stimulus from the starting prism and the importance of that ray - gain *= mesh.getBetaVolume(startPrism) * importance[startPrism + reflectionOffset]; + gain *= mesh.getBetaVolume(startPrism) * importance[startPrism]; assert(!isnan(mesh.getBetaVolume(startPrism))); - assert(!isnan(importance[startPrism + reflectionOffset])); + assert(!isnan(importance[startPrism])); assert(!isnan(gain)); gainSumTemp += gain; diff --git a/src/map_rays_to_prisms.cu b/src/map_rays_to_prisms.cu index dcbd686..19cefba 100644 --- a/src/map_rays_to_prisms.cu +++ b/src/map_rays_to_prisms.cu @@ -58,6 +58,7 @@ __global__ void mapPrefixSumToPrisms( const unsigned numberOfPrisms, const unsigned* raysPerPrism, const unsigned* prefixSum, + const unsigned offset, unsigned *indicesOfPrisms ){ @@ -66,7 +67,7 @@ __global__ void mapPrefixSumToPrisms( if(id >= numberOfPrisms) return; const unsigned count = raysPerPrism[id]; - const unsigned startingPosition = prefixSum[id]; + const unsigned startingPosition = prefixSum[id]-offset; const unsigned prism_i = id; for(unsigned i=0; i < count ; ++i){ @@ -78,19 +79,23 @@ __global__ void mapPrefixSumToPrisms( void mapRaysToPrisms( device_vector &indicesOfPrisms, const device_vector::iterator raysPerPrismBegin, - const device_vector::iterator raysPerPrismEnd + const device_vector::iterator raysPerPrismEnd, + const device_vector::iterator prefixSumBegin, + const device_vector::iterator prefixSumEnd, + const unsigned offset ){ // blocksize chosen by occupancyCalculator const unsigned blocksize = 256; const unsigned gridsize = (raysPerPrismEnd-raysPerPrismBegin +blocksize-1)/blocksize; - device_vector prefixSum(raysPerPrismEnd-raysPerPrismBegin); + //device_vector prefixSum(raysPerPrismEnd-raysPerPrismBegin); - thrust::exclusive_scan(raysPerPrismBegin, raysPerPrismEnd, prefixSum.begin()); + //thrust::exclusive_scan(raysPerPrismBegin, raysPerPrismEnd, prefixSum.begin()); mapPrefixSumToPrisms<<>> ( - prefixSum.size(), + prefixSumEnd - prefixSumBegin, raw_pointer_cast( &(*raysPerPrismBegin) ), - raw_pointer_cast( &prefixSum[0] ), + raw_pointer_cast( &(*prefixSumBegin) ), + offset, raw_pointer_cast( &indicesOfPrisms[0] ) ); }