Skip to content

Commit

Permalink
calculate each reflection separately
Browse files Browse the repository at this point in the history
This is a test to see if we can split the calculation of a sample point
into multiple kernel-calls (one per reflection slice). The reason is,
that our current code computes all reflection slices in a single huge
array. This old style had several disadvantages that could be fixed:

 - the array `numberOfReflectionSlices` is **huge**. As big as
   indicesOfPrisms, and so it is part of the bottleneck for the number
   of rays.
   - removing this array cuts our memory requirements almost by 50% (for
     really high ray numbers)
   - the kernel does not have to do the memory lookup to find the
     correct reflection_i (all reflections are in the same plane anyway)
 - the arrays `numberOfReflectionSlices` and `raysPerPrism` are
   actually linearized 2D arrays that contain all the reflection planes.
   This leads to more difficult code while we do the `mapRaysToPrisms`.
   - if there is only one reflection plane at a time, this makes it much
     easier to split the numbers of rays even further to allow more rays
     in total (see #2).
   - the resulting code is more maintainable

This is nice and all, but splitting the reflections might introduce some
problems:

 - if there are reflections, we will do a lot more kernel calls and each
   of those might be quite small. So maybe the GPU is not utilized as
   much. Previously, everything was done in 1 huge kernel call.
 - since we don't know how many rays there are in each plane, we have to
   call thrust::reduce in each iteration.
 - since we need multiple ray schedules (one for each reflection plane),
   we also need to call the mapRaysToPrisms in each iteration.

All in all, the performance implications need to be tested. I believe
that this commit can improve long-term code quality and will directly
enable #2. But if the performance suffers, we might need to code some
workaround (maybe use the split functionality only for really high ray
numbers where the tradeoff is not so bad and we really NEED it).
  • Loading branch information
slizzered committed Aug 31, 2015
1 parent 62ad98c commit 3e780d7
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 106 deletions.
6 changes: 3 additions & 3 deletions include/calc_sample_gain_sum.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@
* @param indicesOfPrisms a mapping from rays to prisms. The value x on the
* n-th position denotes that ray n is supposed to start propagating
* from the x-th prism
* @param numberOfReflectionSlices similar to indicesOfPrisms, but denotes the
* reflection slice from which each ray starts (see importance sampling)
* @param reflection_i denotes the reflection slice from which each ray starts
* (see importance sampling)
* @param importance the importance for each prism, obtained through the
* importance sampling kernel
* @param raysPerSample the number of rays to use for the sample point s_i
Expand All @@ -71,7 +71,7 @@
__global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
const Mesh mesh,
const unsigned* indicesOfPrisms,
const unsigned* numberOfReflections,
unsigned reflection_i,
const double* importance,
const unsigned raysPerSample,
float *gainSum,
Expand Down
23 changes: 7 additions & 16 deletions include/map_rays_to_prisms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,12 @@
* output numberOfReflections is handled in a similar way
*
*
* @param indicesOfPrisms a pointer to the OUTPUT generated like described in the example
* @param numberOfReflections a pointer to the OUTPUT similar to indicesOfPrisms
* @param raysPerPrism the input array
* must be equal to the the length of the prefixSum.
* @param reflectionSlices the number of reflectionSlices. see numberOfPrisms
* @param raysPerSample the size of indicesOfPrisms/numberOfReflections. Actually
* identical to the sum of all values in raysPerPrism
* @param numberOfPrisms the number of prisms. numberOfPrisms * reflectionSlices
* @param indicesOfPrisms a reference to the OUTPUT generated like described in the example
* @param raysPerPrismStart iterator to the input array
* @param raysPerPrismEnd iterator to the input array
*/
void mapRaysToPrisms(
thrust::device_vector<unsigned> &indicesOfPrisms,
thrust::device_vector<unsigned> &numberOfReflections,
const thrust::device_vector<unsigned> &raysPerPrism,
thrust::device_vector<unsigned> &prefixSum,
const unsigned reflectionSlices,
const unsigned raysPerSample,
const unsigned numberOfPrisms
);
thrust::device_vector<unsigned> &indicesOfPrisms,
const thrust::device_vector<unsigned>::iterator raysPerPrismStart,
const thrust::device_vector<unsigned>::iterator raysPerPrismEnd
);
80 changes: 44 additions & 36 deletions src/calc_phi_ase.cu
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,11 @@ float calcPhiAse ( const ExperimentParameters& experiment,
// }

// Memory allocation/init and copy for device memory
device_vector<unsigned> dNumberOfReflectionSlices(experiment.maxRaysPerSample, 0);
device_vector<float> dGainSum (1, 0);
device_vector<float> dGainSumSquare (1, 0);
device_vector<unsigned> dRaysPerPrism (mesh.numberOfPrisms * reflectionSlices, 1);
device_vector<unsigned> dPrefixSum (mesh.numberOfPrisms * reflectionSlices, 0);
device_vector<double> dImportance (mesh.numberOfPrisms * reflectionSlices, 0);
device_vector<double> dPreImportance (mesh.numberOfPrisms * reflectionSlices, 0);
device_vector<unsigned> dIndicesOfPrisms (experiment.maxRaysPerSample, 0);
device_vector<double> dSigmaA (experiment.sigmaA.begin(), experiment.sigmaA.end());
device_vector<double> dSigmaE (experiment.sigmaE.begin(),experiment.sigmaE.end());

Expand Down Expand Up @@ -165,42 +162,53 @@ float calcPhiAse ( const ExperimentParameters& experiment,
blockDim,
gridDim);

// Prism scheduling for gpu threads
mapRaysToPrisms(dIndicesOfPrisms, dNumberOfReflectionSlices, dRaysPerPrism, dPrefixSum, reflectionSlices, hRaysPerSampleDump, mesh.numberOfPrisms);

// Start Kernel
dGainSum[0] = 0;
dGainSumSquare[0] = 0;

if(experiment.useReflections){
calcSampleGainSumWithReflection<<< gridDim, blockDim >>>(devMTGPStates,
mesh,
raw_pointer_cast(&dIndicesOfPrisms[0]),
raw_pointer_cast(&dNumberOfReflectionSlices[0]),
raw_pointer_cast(&dImportance[0]),
hRaysPerSampleDump,
raw_pointer_cast(&dGainSum[0]),
raw_pointer_cast(&dGainSumSquare[0]),
sample_i,
raw_pointer_cast(&dSigmaA[0]),
raw_pointer_cast(&dSigmaE[0]),
experiment.sigmaA.size(),
raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
}
else{
calcSampleGainSum<<< gridDim, blockDim >>>(devMTGPStates,
mesh,
raw_pointer_cast(&dIndicesOfPrisms[0]),
raw_pointer_cast(&dImportance[0]),
hRaysPerSampleDump,
raw_pointer_cast(&dGainSum[0]),
raw_pointer_cast(&dGainSumSquare[0]),
sample_i,
raw_pointer_cast(&dSigmaA[0]),
raw_pointer_cast(&dSigmaE[0]),
experiment.sigmaA.size(),
raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
}
for(unsigned reflection_i=0; reflection_i < reflectionSlices; ++reflection_i){
const unsigned reflectionOffset = mesh.numberOfPrisms * reflection_i;
device_vector<double>::iterator reflImportanceBegin = dImportance.begin() + reflectionOffset;
device_vector<unsigned>::iterator reflRaysPerPrismBegin = dRaysPerPrism.begin() + reflectionOffset;
device_vector<unsigned>::iterator reflRaysPerPrismEnd = reflRaysPerPrismBegin + mesh.numberOfPrisms;

unsigned hRaysPerSampleIteration = thrust::reduce(reflRaysPerPrismBegin, reflRaysPerPrismEnd, 0u);
device_vector<unsigned> dIndicesOfPrisms(hRaysPerSampleIteration, 0);
mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd);

// Start Kernel
if(experiment.useReflections){
calcSampleGainSumWithReflection<<< gridDim, blockDim >>>(
devMTGPStates,
mesh,
raw_pointer_cast(&dIndicesOfPrisms[0]),
reflection_i,
raw_pointer_cast( &(*reflImportanceBegin) ),
hRaysPerSampleIteration,
raw_pointer_cast(&dGainSum[0]),
raw_pointer_cast(&dGainSumSquare[0]),
sample_i,
raw_pointer_cast(&dSigmaA[0]),
raw_pointer_cast(&dSigmaE[0]),
experiment.sigmaA.size(),
raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
}
else{
calcSampleGainSum<<< gridDim, blockDim >>>(
devMTGPStates,
mesh,
raw_pointer_cast(&dIndicesOfPrisms[0]),
raw_pointer_cast( &(*reflImportanceBegin) ),
hRaysPerSampleIteration,
raw_pointer_cast(&dGainSum[0]),
raw_pointer_cast(&dGainSumSquare[0]),
sample_i,
raw_pointer_cast(&dSigmaA[0]),
raw_pointer_cast(&dSigmaE[0]),
experiment.sigmaA.size(),
raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
}
}


float mseTmp = calcMSE(dGainSum[0], dGainSumSquare[0], hRaysPerSampleDump);

Expand Down
3 changes: 1 addition & 2 deletions src/calc_sample_gain_sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ __device__ __inline__ unsigned genRndSigmas(unsigned length, curandStateMtgp32*
__global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
const Mesh mesh,
const unsigned* indicesOfPrisms,
const unsigned* numberOfReflectionSlices,
const unsigned reflection_i,
const double* importance,
const unsigned raysPerSample,
float *gainSum,
Expand All @@ -117,7 +117,6 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,

// Get triangle/prism to start ray from
unsigned startPrism = indicesOfPrisms[rayNumber];
unsigned reflection_i = numberOfReflectionSlices[rayNumber];
unsigned reflections = (reflection_i + 1) / 2;
ReflectionPlane reflectionPlane = (reflection_i % 2 == 0) ? BOTTOM_REFLECTION : TOP_REFLECTION;
unsigned startLevel = startPrism / mesh.numberOfTriangles;
Expand Down
80 changes: 31 additions & 49 deletions src/map_rays_to_prisms.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@


using thrust::device_vector;
using thrust::host_vector;
using thrust::raw_pointer_cast;

/**
Expand All @@ -48,67 +47,50 @@ using thrust::raw_pointer_cast;
* resulting output arrays:
* [0 0 0 2 2 3] (indicesOfPrisms)
*
* output numberOfReflections is handled in a similar way
*
*
* @param numberOfPrisms the number of prisms. numberOfPrisms * reflectionSlices
* @param numberOfPrisms the number of prisms.
* must be equal to the the length of the prefixSum.
* @param raysPerSample the size of indicesOfPrisms/numberOfReflections. Actually
* identical to the sum of all values in raysPerPrism
* @param reflectionSlices the number of reflectionSlices. see numberOfPrisms
* @param raysPerPrism the input array from which prefixSum was generated
* @param prefixSum the prefixSum generated from raysPerPrism
* @param indicesOfPrisms a pointer to the OUTPUT generated like described in the example
* @param numberOfReflections a pointer to the OUTPUT similar to indicesOfPrisms
*/
__global__ void mapPrefixSumToPrisms(
const unsigned numberOfPrisms,
const unsigned raysPerSample,
const unsigned reflectionSlices,
const unsigned* raysPerPrism,
const unsigned* prefixSum,
unsigned *indicesOfPrisms,
unsigned *numberOfReflections
){
const unsigned numberOfPrisms,
const unsigned* raysPerPrism,
const unsigned* prefixSum,
unsigned *indicesOfPrisms
){

int id = threadIdx.x + (blockIdx.x * blockDim.x);
// break if we have too many threads (this is likely)
if(id >= numberOfPrisms*reflectionSlices) return;
int id = threadIdx.x + (blockIdx.x * blockDim.x);
// break if we have too many threads (this is likely)
if(id >= numberOfPrisms) return;

const unsigned count = raysPerPrism[id];
const unsigned startingPosition = prefixSum[id];
const unsigned reflection_i = id / numberOfPrisms;
const unsigned prism_i = id % numberOfPrisms;
const unsigned count = raysPerPrism[id];
const unsigned startingPosition = prefixSum[id];
const unsigned prism_i = id;

for(unsigned i=0; i < count ; ++i){
indicesOfPrisms[startingPosition + i] = prism_i;
numberOfReflections[startingPosition + i] = reflection_i;
}
for(unsigned i=0; i < count ; ++i){
indicesOfPrisms[startingPosition + i] = prism_i;
}
}

void mapRaysToPrisms(
device_vector<unsigned> &indicesOfPrisms,
device_vector<unsigned> &numberOfReflections,
const device_vector<unsigned> &raysPerPrism,
device_vector<unsigned> &prefixSum,
const unsigned reflectionSlices,
const unsigned raysPerSample,
const unsigned numberOfPrisms
){

// blocksize chosen by occupancyCalculator
const unsigned blocksize = 256;
const unsigned gridsize = (raysPerPrism.size()+blocksize-1)/blocksize;
void mapRaysToPrisms(
device_vector<unsigned> &indicesOfPrisms,
const device_vector<unsigned>::iterator raysPerPrismBegin,
const device_vector<unsigned>::iterator raysPerPrismEnd
){
// blocksize chosen by occupancyCalculator
const unsigned blocksize = 256;
const unsigned gridsize = (raysPerPrismEnd-raysPerPrismBegin +blocksize-1)/blocksize;
device_vector<unsigned> prefixSum(raysPerPrismEnd-raysPerPrismBegin);

thrust::exclusive_scan(raysPerPrism.begin(), raysPerPrism.end(),prefixSum.begin());
thrust::exclusive_scan(raysPerPrismBegin, raysPerPrismEnd, prefixSum.begin());

mapPrefixSumToPrisms <<<gridsize,blocksize>>> (
numberOfPrisms,
raysPerSample,
reflectionSlices,
raw_pointer_cast( &raysPerPrism[0] ),
raw_pointer_cast( &prefixSum[0] ),
raw_pointer_cast( &indicesOfPrisms[0] ),
raw_pointer_cast( &numberOfReflections[0] )
);
mapPrefixSumToPrisms<<<gridsize,blocksize>>> (
prefixSum.size(),
raw_pointer_cast( &(*raysPerPrismBegin) ),
raw_pointer_cast( &prefixSum[0] ),
raw_pointer_cast( &indicesOfPrisms[0] )
);
}

0 comments on commit 3e780d7

Please sign in to comment.