Skip to content

Commit

Permalink
Refactored to reduce workload
Browse files Browse the repository at this point in the history
 - Reuse all vectors as much as possible
 - One device vector is still created inside the loop (which seems wasteful at
   first) to enable an easy transition to CUDA streams in the future :)
  • Loading branch information
slizzered committed Aug 31, 2015
1 parent 3e780d7 commit 999b652
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 16 deletions.
7 changes: 5 additions & 2 deletions include/map_rays_to_prisms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
*/
void mapRaysToPrisms(
    // output: one entry per ray; must already be sized by the caller
    thrust::device_vector<unsigned> &indicesOfPrisms,
    // [raysPerPrismBegin, raysPerPrismEnd): number of rays for each prism
    const thrust::device_vector<unsigned>::iterator raysPerPrismBegin,
    const thrust::device_vector<unsigned>::iterator raysPerPrismEnd,
    // [prefixSumBegin, prefixSumEnd): precomputed exclusive prefix sum that
    // belongs to the same prisms as the raysPerPrism range
    const thrust::device_vector<unsigned>::iterator prefixSumBegin,
    const thrust::device_vector<unsigned>::iterator prefixSumEnd,
    // value subtracted from every prefix-sum entry, so that a sub-range of a
    // larger prefix sum yields positions relative to the start of indicesOfPrisms
    const unsigned offset
    );
30 changes: 25 additions & 5 deletions src/calc_phi_ase.cu
Original file line number Diff line number Diff line change
Expand Up @@ -162,18 +162,38 @@ float calcPhiAse ( const ExperimentParameters& experiment,
blockDim,
gridDim);

dGainSum[0] = 0;
dGainSumSquare[0] = 0;
// prepare the Prefix Sum and determine how many rays will be started in each slice
device_vector<unsigned> dPrefixSumComplete(dRaysPerPrism.size());
thrust::exclusive_scan(dRaysPerPrism.begin(),dRaysPerPrism.end(),dPrefixSumComplete.begin());
std::vector<unsigned> hRaysPerIterationV(reflectionSlices);
std::vector<unsigned> hRaysPerIterationOffsetV(reflectionSlices);

for(unsigned i = reflectionSlices, sum = hRaysPerSampleDump; i > 0; ){
--i;
const unsigned offset = i * mesh.numberOfPrisms;
hRaysPerIterationOffsetV[i] = dPrefixSumComplete[offset];
hRaysPerIterationV[i] = sum-dPrefixSumComplete[offset];
sum -= hRaysPerIterationV[i];
}
//device_vector<unsigned> dIndicesOfPrisms( *(std::max_element(hRaysPerIterationV.begin(), hRaysPerIterationV.end())) );

dGainSum[0] = 0;
dGainSumSquare[0] = 0;

for(unsigned reflection_i=0; reflection_i < reflectionSlices; ++reflection_i){
unsigned hRaysPerSampleIteration = hRaysPerIterationV[reflection_i];
if(hRaysPerSampleIteration == 0) continue;

const unsigned reflectionOffset = mesh.numberOfPrisms * reflection_i;
device_vector<double>::iterator reflImportanceBegin = dImportance.begin() + reflectionOffset;
device_vector<unsigned>::iterator reflRaysPerPrismBegin = dRaysPerPrism.begin() + reflectionOffset;
device_vector<unsigned>::iterator reflRaysPerPrismEnd = reflRaysPerPrismBegin + mesh.numberOfPrisms;
device_vector<unsigned>::iterator reflPrefixSumBegin = dPrefixSumComplete.begin() + reflectionOffset;
device_vector<unsigned>::iterator reflPrefixSumEnd = reflPrefixSumBegin + mesh.numberOfPrisms;

device_vector<unsigned> dIndicesOfPrisms(hRaysPerSampleIteration);

unsigned hRaysPerSampleIteration = thrust::reduce(reflRaysPerPrismBegin, reflRaysPerPrismEnd, 0u);
device_vector<unsigned> dIndicesOfPrisms(hRaysPerSampleIteration, 0);
mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd);
mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd, reflPrefixSumBegin, reflPrefixSumEnd, hRaysPerIterationOffsetV[reflection_i]);

// Start Kernel
if(experiment.useReflections){
Expand Down
5 changes: 2 additions & 3 deletions src/calc_sample_gain_sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
ReflectionPlane reflectionPlane = (reflection_i % 2 == 0) ? BOTTOM_REFLECTION : TOP_REFLECTION;
unsigned startLevel = startPrism / mesh.numberOfTriangles;
unsigned startTriangle = startPrism - (mesh.numberOfTriangles * startLevel);
unsigned reflectionOffset = reflection_i * mesh.numberOfPrisms;
Point startPoint = mesh.genRndPoint(startTriangle, startLevel, globalState);

//get a random index in the wavelength array
Expand All @@ -131,10 +130,10 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
double gain = propagateRayWithReflection(startPoint, samplePoint, reflections, reflectionPlane, startLevel, startTriangle, mesh, sigmaA[sigma_i], sigmaE[sigma_i]);

// include the stimulus from the starting prism and the importance of that ray
gain *= mesh.getBetaVolume(startPrism) * importance[startPrism + reflectionOffset];
gain *= mesh.getBetaVolume(startPrism) * importance[startPrism];

assert(!isnan(mesh.getBetaVolume(startPrism)));
assert(!isnan(importance[startPrism + reflectionOffset]));
assert(!isnan(importance[startPrism]));
assert(!isnan(gain));

gainSumTemp += gain;
Expand Down
17 changes: 11 additions & 6 deletions src/map_rays_to_prisms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ __global__ void mapPrefixSumToPrisms(
const unsigned numberOfPrisms,
const unsigned* raysPerPrism,
const unsigned* prefixSum,
const unsigned offset,
unsigned *indicesOfPrisms
){

Expand All @@ -66,7 +67,7 @@ __global__ void mapPrefixSumToPrisms(
if(id >= numberOfPrisms) return;

const unsigned count = raysPerPrism[id];
const unsigned startingPosition = prefixSum[id];
const unsigned startingPosition = prefixSum[id]-offset;
const unsigned prism_i = id;

for(unsigned i=0; i < count ; ++i){
Expand All @@ -78,19 +79,23 @@ __global__ void mapPrefixSumToPrisms(
/**
 * @brief Launches mapPrefixSumToPrisms to fill indicesOfPrisms from the
 *        per-prism ray counts and an already computed prefix sum.
 *
 * The prefix sum is no longer computed here; the caller passes a (sub-)range
 * of a prefix sum it computed once, together with the value that has to be
 * subtracted to make the positions relative to the start of indicesOfPrisms.
 *
 * @param indicesOfPrisms    output vector, must already be sized by the caller
 * @param raysPerPrismBegin  begin of the range holding the ray count of each prism
 * @param raysPerPrismEnd    end of that range
 * @param prefixSumBegin     begin of the exclusive prefix sum for the same prisms
 * @param prefixSumEnd       end of that range
 * @param offset             value subtracted from every prefix-sum entry inside
 *                           the kernel (startingPosition = prefixSum[id] - offset)
 *
 * NOTE(review): the body of the kernel's write loop is not visible here;
 * presumably it stores the prism index at startingPosition + i -- confirm
 * that indicesOfPrisms is large enough for the sum of all ray counts.
 */
void mapRaysToPrisms(
    device_vector<unsigned> &indicesOfPrisms,
    const device_vector<unsigned>::iterator raysPerPrismBegin,
    const device_vector<unsigned>::iterator raysPerPrismEnd,
    const device_vector<unsigned>::iterator prefixSumBegin,
    const device_vector<unsigned>::iterator prefixSumEnd,
    const unsigned offset
    ){
  // blocksize chosen by occupancyCalculator
  const unsigned blocksize = 256;

  // Both ranges are expected to describe the same prisms. Use the smaller
  // one as the bound, so that neither the grid size nor the kernel's bounds
  // check can let a thread read past the end of the shorter range.
  const unsigned numberOfRayCounts  = raysPerPrismEnd - raysPerPrismBegin;
  const unsigned numberOfPrefixSums = prefixSumEnd - prefixSumBegin;
  const unsigned numberOfPrisms =
    (numberOfRayCounts < numberOfPrefixSums) ? numberOfRayCounts : numberOfPrefixSums;

  // Nothing to map: a launch with zero blocks is an invalid configuration,
  // and an empty output vector has no element whose address could be taken.
  if(numberOfPrisms == 0 || indicesOfPrisms.empty()) return;

  const unsigned gridsize = (numberOfPrisms + blocksize - 1) / blocksize;

  mapPrefixSumToPrisms<<<gridsize,blocksize>>> (
      numberOfPrisms,
      raw_pointer_cast( &(*raysPerPrismBegin) ),
      raw_pointer_cast( &(*prefixSumBegin) ),
      offset,
      raw_pointer_cast( &indicesOfPrisms[0] )
      );
}

0 comments on commit 999b652

Please sign in to comment.