calculate each reflection separately

This is a test to see if we can split the calculation of a sample point into multiple kernel-calls (one per reflection slice). The reason is, that our current code computes all reflection slices in a single huge array. This old style had several disadvantages that could be fixed: - the array `numberOfReflectionSlices` is **huge**. As big as indicesOfPrisms, and so it is part of the bottleneck for the number of rays. - removing this array cuts our memory requirements almost by 50% (for really high ray numbers) - the kernel does not have to do the memory lookup to find the correct reflection_i (all reflections are in the same plane anyway) - the arrays `numberOfReflectionSlices` and `raysPerPrism` are actually linearized 2D arrays that contain all the reflection planes. This leads to more difficult code while we do the `mapRaysToPrisms`. - if there is only one reflection plane at a time, this makes it much easier to split the numbers of rays even further to allow more rays in total (see #2). - the resulting code is more maintainable This is nice and all, but splitting the reflections might introduce some problems: - if there are reflections, we will do a lot more kernel calls and each of those might be quite small. So maybe the GPU is not utilized as much. Previously, everything was done in 1 huge kernel call. - since we don't know how many rays there are in each plane, we have to call thrust::reduce in each iteration. - since we need multiple ray schedules (one for each reflection plane), we also need to call the mapRaysToPrisms in each iteration. All in all, the performance implications need to be tested. I believe that this commit can improve long-term code quality and will directly enable #2. But if the performance suffers, we might need to code some workaround (maybe use the split functionality only for really high ray numbers where the tradeoff is not so bad and we really NEED it).
ComputationalRadiationPhysics · Aug 31, 2015 · 3e780d7 · 3e780d7
1 parent 62ad98c
commit 3e780d7
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 106 deletions.
diff --git a/include/calc_sample_gain_sum.hpp b/include/calc_sample_gain_sum.hpp
@@ -52,8 +52,8 @@
  * @param indicesOfPrisms a mapping from rays to prisms. The value x on the
  *	      n-th position denotes that ray n is supposed to start propagating 
  *	      from the x-th prism
- * @param numberOfReflectionSlices similar to indicesOfPrisms, but denotes the
- *        reflection slice from which each ray starts (see importance sampling)
+ * @param reflection_i denotes the reflection slice from which each ray starts
+ *        (see importance sampling)
  * @param importance the importance for each prism, obtained through the
  *        importance sampling kernel
  * @param raysPerSample the number of rays to use for the sample point s_i
@@ -71,7 +71,7 @@
 __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState, 
 				 const Mesh mesh,
 				 const unsigned* indicesOfPrisms, 
-				 const unsigned* numberOfReflections,
+				 unsigned reflection_i,
 				 const double* importance,
 				 const unsigned raysPerSample,
 				 float *gainSum,

diff --git a/include/map_rays_to_prisms.hpp b/include/map_rays_to_prisms.hpp
@@ -45,21 +45,12 @@
  *        output numberOfReflections is handled in a similar way
  *
  *
- * @param indicesOfPrisms a pointer to the OUTPUT generated like described in the example
- * @param numberOfReflections a pointer to the OUTPUT similar to indicesOfPrisms
- * @param raysPerPrism the input array
- *                       must be equal to the the length of the prefixSum.
- * @param reflectionSlices the number of reflectionSlices. see numberOfPrisms
- * @param raysPerSample the size of indicesOfPrisms/numberOfReflections. Actually 
- *                      identical to the sum of all values in raysPerPrism
- * @param numberOfPrisms the number of prisms. numberOfPrisms * reflectionSlices
+ * @param indicesOfPrisms a reference to the OUTPUT generated like described in the example
+ * @param raysPerPrismStart iterator to the input array
+ * @param raysPerPrismEnd iterator to the input array
  */
 void mapRaysToPrisms(
-    thrust::device_vector<unsigned> &indicesOfPrisms,
-    thrust::device_vector<unsigned> &numberOfReflections,
-    const thrust::device_vector<unsigned> &raysPerPrism,
-    thrust::device_vector<unsigned> &prefixSum,
-    const unsigned reflectionSlices,
-    const unsigned raysPerSample,
-    const unsigned numberOfPrisms
-    );
+        thrust::device_vector<unsigned> &indicesOfPrisms,
+        const thrust::device_vector<unsigned>::iterator raysPerPrismStart,
+        const thrust::device_vector<unsigned>::iterator raysPerPrismEnd
+        );
diff --git a/src/calc_phi_ase.cu b/src/calc_phi_ase.cu
@@ -112,14 +112,11 @@ float calcPhiAse ( const ExperimentParameters& experiment,
   // }
 
   // Memory allocation/init and copy for device memory
-  device_vector<unsigned> dNumberOfReflectionSlices(experiment.maxRaysPerSample, 0);
   device_vector<float>    dGainSum            (1, 0);
   device_vector<float>    dGainSumSquare      (1, 0);
   device_vector<unsigned> dRaysPerPrism       (mesh.numberOfPrisms * reflectionSlices, 1);
-  device_vector<unsigned> dPrefixSum          (mesh.numberOfPrisms * reflectionSlices, 0);
   device_vector<double>   dImportance         (mesh.numberOfPrisms * reflectionSlices, 0);
   device_vector<double>   dPreImportance      (mesh.numberOfPrisms * reflectionSlices, 0);
-  device_vector<unsigned> dIndicesOfPrisms    (experiment.maxRaysPerSample,  0);
   device_vector<double>   dSigmaA             (experiment.sigmaA.begin(), experiment.sigmaA.end());
   device_vector<double>   dSigmaE             (experiment.sigmaE.begin(),experiment.sigmaE.end());
 
@@ -165,42 +162,53 @@ float calcPhiAse ( const ExperimentParameters& experiment,
 							    blockDim,
 							    gridDim);
 
-	// Prism scheduling for gpu threads
-	mapRaysToPrisms(dIndicesOfPrisms, dNumberOfReflectionSlices, dRaysPerPrism, dPrefixSum, reflectionSlices, hRaysPerSampleDump, mesh.numberOfPrisms);
-
-	// Start Kernel
 	dGainSum[0]       = 0;
 	dGainSumSquare[0] = 0;
 
-	if(experiment.useReflections){
-	  calcSampleGainSumWithReflection<<< gridDim, blockDim >>>(devMTGPStates,
-								   mesh, 
-								   raw_pointer_cast(&dIndicesOfPrisms[0]), 
-								   raw_pointer_cast(&dNumberOfReflectionSlices[0]), 
-								   raw_pointer_cast(&dImportance[0]),
-								   hRaysPerSampleDump, 
-								   raw_pointer_cast(&dGainSum[0]), 
-								   raw_pointer_cast(&dGainSumSquare[0]),
-								   sample_i, 
-								   raw_pointer_cast(&dSigmaA[0]),
-								   raw_pointer_cast(&dSigmaE[0]),
-								   experiment.sigmaA.size(),
-								   raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
-	}
-	else{
-	  calcSampleGainSum<<< gridDim, blockDim >>>(devMTGPStates,
-						     mesh, 
-						     raw_pointer_cast(&dIndicesOfPrisms[0]), 
-						     raw_pointer_cast(&dImportance[0]),
-						     hRaysPerSampleDump, 
-						     raw_pointer_cast(&dGainSum[0]), 
-						     raw_pointer_cast(&dGainSumSquare[0]),
-						     sample_i, 
-						     raw_pointer_cast(&dSigmaA[0]),
-						     raw_pointer_cast(&dSigmaE[0]),
-						     experiment.sigmaA.size(),
-						     raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
-	}
+    for(unsigned reflection_i=0; reflection_i < reflectionSlices; ++reflection_i){
+        const unsigned reflectionOffset = mesh.numberOfPrisms * reflection_i;
+        device_vector<double>::iterator   reflImportanceBegin   = dImportance.begin()   + reflectionOffset;
+        device_vector<unsigned>::iterator reflRaysPerPrismBegin = dRaysPerPrism.begin() + reflectionOffset;
+        device_vector<unsigned>::iterator reflRaysPerPrismEnd   = reflRaysPerPrismBegin + mesh.numberOfPrisms;
+
+        unsigned hRaysPerSampleIteration =  thrust::reduce(reflRaysPerPrismBegin, reflRaysPerPrismEnd, 0u);
+        device_vector<unsigned> dIndicesOfPrisms(hRaysPerSampleIteration,  0);
+        mapRaysToPrisms(dIndicesOfPrisms, reflRaysPerPrismBegin, reflRaysPerPrismEnd);
+
+        // Start Kernel
+        if(experiment.useReflections){
+            calcSampleGainSumWithReflection<<< gridDim, blockDim >>>(
+                    devMTGPStates,
+                    mesh,
+                    raw_pointer_cast(&dIndicesOfPrisms[0]),
+                    reflection_i,
+                    raw_pointer_cast( &(*reflImportanceBegin) ),
+                    hRaysPerSampleIteration,
+                    raw_pointer_cast(&dGainSum[0]),
+                    raw_pointer_cast(&dGainSumSquare[0]),
+                    sample_i,
+                    raw_pointer_cast(&dSigmaA[0]),
+                    raw_pointer_cast(&dSigmaE[0]),
+                    experiment.sigmaA.size(),
+                    raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
+        }
+        else{
+            calcSampleGainSum<<< gridDim, blockDim >>>(
+                    devMTGPStates,
+                    mesh,
+                    raw_pointer_cast(&dIndicesOfPrisms[0]),
+                    raw_pointer_cast( &(*reflImportanceBegin) ),
+                    hRaysPerSampleIteration,
+                    raw_pointer_cast(&dGainSum[0]),
+                    raw_pointer_cast(&dGainSumSquare[0]),
+                    sample_i,
+                    raw_pointer_cast(&dSigmaA[0]),
+                    raw_pointer_cast(&dSigmaE[0]),
+                    experiment.sigmaA.size(),
+                    raw_pointer_cast(&(device_vector<unsigned> (1,0))[0]));
+        }
+    }
+
 
 	float mseTmp = calcMSE(dGainSum[0], dGainSumSquare[0], hRaysPerSampleDump);
 

diff --git a/src/calc_sample_gain_sum.cu b/src/calc_sample_gain_sum.cu
@@ -91,7 +91,7 @@ __device__ __inline__ unsigned genRndSigmas(unsigned length, curandStateMtgp32*
 __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
 						const Mesh mesh, 
 						const unsigned* indicesOfPrisms, 
-						const unsigned* numberOfReflectionSlices,
+						const unsigned reflection_i,
 						const double* importance,
 						const unsigned raysPerSample,
 						float *gainSum, 
@@ -117,7 +117,6 @@ __global__ void calcSampleGainSumWithReflection(curandStateMtgp32* globalState,
 
     // Get triangle/prism to start ray from
     unsigned startPrism             = indicesOfPrisms[rayNumber];
-    unsigned reflection_i           = numberOfReflectionSlices[rayNumber];
     unsigned reflections            = (reflection_i + 1) / 2;
     ReflectionPlane reflectionPlane = (reflection_i % 2 == 0) ? BOTTOM_REFLECTION : TOP_REFLECTION;
     unsigned startLevel             = startPrism / mesh.numberOfTriangles;

diff --git a/src/map_rays_to_prisms.cu b/src/map_rays_to_prisms.cu
@@ -26,7 +26,6 @@
 
 
 using thrust::device_vector;
-using thrust::host_vector;
 using thrust::raw_pointer_cast;
 
 /**
@@ -48,67 +47,50 @@ using thrust::raw_pointer_cast;
  *        resulting output arrays:
  *        [0 0 0 2 2 3] (indicesOfPrisms)
  *
- *        output numberOfReflections is handled in a similar way
  *
- *
- * @param numberOfPrisms the number of prisms. numberOfPrisms * reflectionSlices
+ * @param numberOfPrisms the number of prisms.
  *                       must be equal to the the length of the prefixSum.
- * @param raysPerSample the size of indicesOfPrisms/numberOfReflections. Actually 
- *                      identical to the sum of all values in raysPerPrism
- * @param reflectionSlices the number of reflectionSlices. see numberOfPrisms
  * @param raysPerPrism the input array from which prefixSum was generated
  * @param prefixSum the prefixSum generated from raysPerPrism
  * @param indicesOfPrisms a pointer to the OUTPUT generated like described in the example
- * @param numberOfReflections a pointer to the OUTPUT similar to indicesOfPrisms
  */
 __global__ void mapPrefixSumToPrisms(
-    const unsigned numberOfPrisms,
-    const unsigned raysPerSample,
-    const unsigned reflectionSlices,
-    const unsigned* raysPerPrism,
-    const unsigned* prefixSum,
-    unsigned *indicesOfPrisms,
-    unsigned *numberOfReflections
-    ){
+        const unsigned numberOfPrisms,
+        const unsigned* raysPerPrism,
+        const unsigned* prefixSum,
+        unsigned *indicesOfPrisms
+        ){
 
-  int id = threadIdx.x + (blockIdx.x * blockDim.x);
-  // break if we have too many threads (this is likely)
-  if(id >= numberOfPrisms*reflectionSlices) return;
+    int id = threadIdx.x + (blockIdx.x * blockDim.x);
+    // break if we have too many threads (this is likely)
+    if(id >= numberOfPrisms) return;
 
-  const unsigned count            = raysPerPrism[id];
-  const unsigned startingPosition = prefixSum[id];
-  const unsigned reflection_i     = id / numberOfPrisms;
-  const unsigned prism_i          = id % numberOfPrisms;
+    const unsigned count            = raysPerPrism[id];
+    const unsigned startingPosition = prefixSum[id];
+    const unsigned prism_i          = id;
 
-  for(unsigned i=0; i < count ; ++i){
-    indicesOfPrisms[startingPosition + i] = prism_i;     
-    numberOfReflections[startingPosition + i] = reflection_i; 
-  }
+    for(unsigned i=0; i < count ; ++i){
+        indicesOfPrisms[startingPosition + i] = prism_i;
+    }
 }
 
-void mapRaysToPrisms(
-    device_vector<unsigned> &indicesOfPrisms, 
-    device_vector<unsigned> &numberOfReflections,
-    const device_vector<unsigned> &raysPerPrism, 
-    device_vector<unsigned> &prefixSum, 
-    const unsigned reflectionSlices,
-    const unsigned raysPerSample,
-    const unsigned numberOfPrisms
-    ){
 
-  // blocksize chosen by occupancyCalculator
-  const unsigned blocksize = 256;
-  const unsigned gridsize  = (raysPerPrism.size()+blocksize-1)/blocksize;
+void mapRaysToPrisms(
+        device_vector<unsigned> &indicesOfPrisms,
+        const device_vector<unsigned>::iterator raysPerPrismBegin,
+        const device_vector<unsigned>::iterator raysPerPrismEnd
+        ){
+    // blocksize chosen by occupancyCalculator
+    const unsigned blocksize = 256;
+    const unsigned gridsize  = (raysPerPrismEnd-raysPerPrismBegin +blocksize-1)/blocksize;
+    device_vector<unsigned> prefixSum(raysPerPrismEnd-raysPerPrismBegin);
 
-  thrust::exclusive_scan(raysPerPrism.begin(), raysPerPrism.end(),prefixSum.begin());
+    thrust::exclusive_scan(raysPerPrismBegin, raysPerPrismEnd, prefixSum.begin());
 
-  mapPrefixSumToPrisms <<<gridsize,blocksize>>> (
-      numberOfPrisms, 
-      raysPerSample, 
-      reflectionSlices,
-      raw_pointer_cast( &raysPerPrism[0] ),
-      raw_pointer_cast( &prefixSum[0] ), 
-      raw_pointer_cast( &indicesOfPrisms[0] ),
-      raw_pointer_cast( &numberOfReflections[0] )
-      );
+    mapPrefixSumToPrisms<<<gridsize,blocksize>>> (
+            prefixSum.size(),
+            raw_pointer_cast( &(*raysPerPrismBegin) ),
+            raw_pointer_cast( &prefixSum[0] ),
+            raw_pointer_cast( &indicesOfPrisms[0] )
+            );
 }