diff --git a/python/proj/wcs.py b/python/proj/wcs.py
index ee1d3b59..e9023b8b 100644
--- a/python/proj/wcs.py
+++ b/python/proj/wcs.py
@@ -90,6 +90,13 @@ class Projectionist:
         operations.  Such objects should satisfy the condition that
         threads[x,j]*threads[y,j] is the empty Range for x != y;
         i.e. each detector-sample is assigned to at most one thread.
+    * ``interpol``: How positions that fall between pixel centers
+      are handled.  Options are "nearest" (default): use Nearest
+      Neighbor interpolation, so a sample takes the value of
+      whatever pixel is closest; or "bilinear": linearly
+      interpolate between the four closest pixels.  Bilinear is
+      around 60% slower, but avoids problems caused by a
+      discontinuous model.
 
     Attributes:
       naxis: 2-element integer array specifying the map shape (for
@@ -146,13 +153,12 @@ def tiling(self):
         return _Tiling(self.naxis[::-1], self.tile_shape)
 
     @classmethod
-    def for_geom(cls, shape, wcs):
+    def for_geom(cls, shape, wcs, interpol=None):
         """Construct a Projectionist for use with the specified "geometry".
 
         The shape and wcs are the core information required to prepare
         a Projectionist, so this method is likely to be called by
         other constructors.
-
         """
         self = cls()
         ax1, ax2 = wcs.wcs.lng, wcs.wcs.lat
@@ -177,10 +183,14 @@ def for_geom(cls, shape, wcs):
         self.cdelt = np.array(wcs.wcs.cdelt) * quat.DEG
         self.crpix = np.array(wcs.wcs.crpix)
 
+        # Pixel interpolation mode
+        if interpol is None: interpol = "nearest"
+        self.interpol = interpol
+
         return self
 
     @classmethod
-    def for_map(cls, emap, wcs=None):
+    def for_map(cls, emap, wcs=None, interpol=None):
         """Construct a Projectionist for use with maps having the same
         geometry as the provided enmap.
 
@@ -190,7 +200,6 @@ def for_map(cls, emap, wcs=None):
             with shape attribute), provided that wcs is provided
             separately.
           wcs: optional WCS object to use instead of emap.wcs.
-
         """
         if wcs is None:
             wcs = emap.wcs
@@ -214,7 +223,7 @@ def for_source_at(cls, alpha0, delta0, gamma0=0.,
         return self
 
     @classmethod
-    def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True):
+    def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True, interpol=None):
         """Construct a Projectionist for use with the specified geometry
         (shape, wcs), cut into tiles of shape tile_shape.
 
@@ -232,7 +241,7 @@ def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True):
         populate on such calls.
 
         """
-        self = cls.for_geom(shape, wcs)
+        self = cls.for_geom(shape, wcs, interpol=interpol)
        self.tile_shape = np.array(tile_shape, 'int')
         if active_tiles is True:
             self.active_tiles = list(range(self.tiling.tile_count))
@@ -260,15 +269,19 @@ def _get_pixelizor_args(self):
         return args
 
     def get_ProjEng(self, comps='TQU', proj_name=None, get=True,
-                    instance=True):
+                    instance=True, interpol=None):
         """Returns an so3g.ProjEng object appropriate for use with the
         configured geometry.
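+
+        As a usage sketch (illustrative; assumes a CAR geometry, so that
+        proj_name resolves to 'CAR'), the interpolation mode just selects
+        a suffix on the engine class name::
+
+            p.get_ProjEng('TQU', get=False)
+            # -> 'ProjEng_CAR_TQU_NonTiled'
+            p.get_ProjEng('TQU', get=False, interpol='bilinear')
+            # -> 'ProjEng_CAR_TQU_NonTiled_Bilinear'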
""" - if proj_name is None: - proj_name = self.proj_name + if proj_name is None: proj_name = self.proj_name tile_suffix = 'Tiled' if self.tiling else 'NonTiled' - projeng_name = f'ProjEng_{proj_name}_{comps}_{tile_suffix}' + # Interpolation mode + if interpol is None: interpol = self.interpol + if interpol in ["nn", "nearest"]: interpol_suffix = "" + elif interpol in ["lin", "bilinear"]: interpol_suffix = "_Bilinear" + else: raise ValueError("ProjEng interpol '%s' not recognized" % str(interpol)) + projeng_name = f'ProjEng_{proj_name}_{comps}_{tile_suffix}{interpol_suffix}' if not get: return projeng_name try: @@ -499,7 +512,7 @@ def assign_threads(self, assembly, method='domdir', n_threads=None): projeng = self.get_ProjEng('T') q_native = self._cache_q_fp_to_native(assembly.Q) omp_ivals = projeng.pixel_ranges(q_native, assembly.dets, None, n_threads) - return RangesMatrix([RangesMatrix(x) for x in omp_ivals]) + return wrap_ivals(omp_ivals) elif method == 'domdir': offs_rep = assembly.dets[::100] @@ -539,7 +552,7 @@ def assign_threads_from_map(self, assembly, tmap, n_threads=None): q_native = self._cache_q_fp_to_native(assembly.Q) n_threads = mapthreads.get_num_threads(n_threads) omp_ivals = projeng.pixel_ranges(q_native, assembly.dets, tmap, n_threads) - return RangesMatrix([RangesMatrix(x) for x in omp_ivals]) + return wrap_ivals(omp_ivals) def get_active_tiles(self, assembly, assign=False): """For a tiled Projection, figure out what tiles are hit by an @@ -592,7 +605,7 @@ def get_active_tiles(self, assembly, assign=False): group_tiles[idx].append(_t) # Now paint them into Ranges. R = projeng.tile_ranges(q_native, assembly.dets, group_tiles) - R = RangesMatrix([RangesMatrix(r) for r in R]) + R = wrap_ivals(R) return { 'active_tiles': list(tiles), 'hit_counts': list(hits), @@ -604,6 +617,8 @@ def get_active_tiles(self, assembly, assign=False): 'hit_counts': list(hits), } + _ivals_format = 2 + class _Tiling: """Utility class for computations involving tiled maps. @@ -629,6 +644,8 @@ def tile_offset(self, tile): row, col = self.tile_rowcol(tile) return row * self.tile_shape[0], col * self.tile_shape[1] +def wrap_ivals(ivals): + return RangesMatrix([RangesMatrix([RangesMatrix(y) for y in x]) for x in ivals]) THREAD_ASSIGNMENT_METHODS = [ 'simple', diff --git a/src/Projection.cxx b/src/Projection.cxx index 556aad1b..f83b771b 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -84,6 +84,42 @@ inline bool isNone(const bp::object &pyo) // quite easily; the most obvious candidate being spin-1, to carry // information such as in-plane pointing displacements. +// Bilinear mapmaking (and interpolated mapmaking in general) +// +// Originally only nearest-neighbor mapmaking was supported. Here, +// a sample takes the value of the nearest pixel, regardless of where +// inside this pixel it hits. This means each sample only needs to +// care about a single pixel, which is computed by Pixelizor's GetPixel. +// This simple relationship makes it elatively easy to avoid +// clobbering when using threads in the tod2map operation, as one could +// assign responsibility for different areas of pixels to different threads, +// and translate this directly into sample ranges for those threads. +// +// In interpolated mapmaking like bilinear mapmaking, the value of a sample +// is given by some linear combination of several of the nearest pixels. 
+// For bilinear mapmaking, these are the four pixels above/below and
+// left/right of the sample, each weighted by how close the pixel is to
+// the sample.  The pixels and corresponding weights are calculated using
+// Pixelizor's GetPixels function, which in turn uses the Pixelizor's Interpol
+// template argument to determine which type of interpolation should be used
+// (including the "no interpolation" nearest neighbor case).
+//
+// However, since each sample now hits multiple pixels, it is more difficult
+// to build a set of sample ranges that touch disjoint sets of pixels and so
+// can be iterated over in parallel without locking in the tod2map operation.
+// There will always be some samples that hit the edge of each thread's pixel
+// group, and so affect multiple threads' pixels.  We handle this by
+// generalizing the sample ranges from ranges = vector<vector<RangesInt32>>,
+// which is [nthread][ndet][nrange] in shape, to
+// ranges = vector<vector<vector<RangesInt32>>>,
+// which is [nbunch][nthread][ndet][nrange], and where we iterate over the
+// outermost "bunch" level in series, and then in parallel over the next level.
+// This general framework should work for all parallelization schemes, but what
+// we've implemented so far is a simple case where all samples that unambiguously
+// fall inside a single thread's pixel domain are handled as before, and the few
+// samples that fall in multiple domains are handled by a single thread at the
+// end.  This means that ranges[0] has shape [nthread][ndet][nrange] while
+// ranges[1] has shape [1][ndet][nrange].  More complicated schemes could be
+// used, but this seems to work well enough.
 
 // State the template options
 class ProjQuat;
@@ -98,6 +134,10 @@ class ProjZEA;
 class Tiled;
 class NonTiled;
 
+// State the pointing matrix interpolation options
+struct NearestNeighbor { static const int interp_count = 1; };
+struct Bilinear { static const int interp_count = 4; };
+
 // Helper template for SpinSys...
 template <typename SpinSys>
 class SpinClass {
@@ -383,13 +423,14 @@ void Pointer::GetCoords(int i_det, int i_time,
 //! Pixelizors
 //
 
-template <typename TilingSys>
+template <typename TilingSys, typename Interpol = NearestNeighbor>
 class Pixelizor2_Flat;
 
-template <>
-class Pixelizor2_Flat<NonTiled> {
+template <typename Interpol>
+class Pixelizor2_Flat<NonTiled, Interpol> {
 public:
     static const int index_count = 2;
+    static const int interp_count = Interpol::interp_count;
     Pixelizor2_Flat(int ny, int nx,
                     double dy=1., double dx=1.,
                     double iy0=0., double ix0=0.) {
@@ -444,6 +485,8 @@ class Pixelizor2_Flat {
         pixel_index[0] = int(iy);
         pixel_index[1] = int(ix);
     }
+    inline
+    int GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]);
     bool TestInputs(bp::object &map, bool need_map, bool need_weight_map,
                     int comp_count) {
         if (need_map) {
@@ -487,10 +530,56 @@ class Pixelizor2_Flat {
     BufferWrapper<FSIGNAL> mapbuf;
 };
 
-template <>
-class Pixelizor2_Flat<Tiled> {
+// Take care of the parts that depend on the interpolation order.
+// First NearestNeighbor interpolation
+
+template<>
+inline int Pixelizor2_Flat<NonTiled, NearestNeighbor>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For nearest neighbor this basically reduces to GetPixel
+    double x = coords[0] / cdelt[1] + crpix[1] - 1 + 0.5;
+    double y = coords[1] / cdelt[0] + crpix[0] - 1 + 0.5;
+    if (x < 0 || x >= naxis[1]) return 0;
+    if (y < 0 || y >= naxis[0]) return 0;
+    // SKN: On my laptop int(y)-int(y<0) takes 0.63 ns vs. 0.84 ns for int(floor(y)).
+    // Both are very fast, so maybe it's better to go for the latter, since it's clearer.
+    // For comparison the plain cast was 0.45 ns.
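+    // E.g. int(-0.3) - int(-0.3 < 0) = 0 - 1 = -1, matching floor(-0.3),
+    // whereas the plain cast alone would round toward zero and give 0.
+    // (Illustrative; negative positions are culled by the bounds check above.)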
+    pixinds[0][0] = int(y)-int(y<0);
+    pixinds[0][1] = int(x)-int(x<0);
+    pixweights[0] = 1;
+    return 1;
+}
+
+// Then Bilinear interpolation
+
+template<>
+inline int Pixelizor2_Flat<NonTiled, Bilinear>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For bilinear mapmaking we need to visit the four bounding pixels
+    double x = coords[0] / cdelt[1] + crpix[1] - 1 + 0.5;
+    double y = coords[1] / cdelt[0] + crpix[0] - 1 + 0.5;
+    int x1 = int(x)-int(x<0);
+    int y1 = int(y)-int(y<0);
+    double wx[2] = {x-x1, 1-(x-x1)};
+    double wy[2] = {y-y1, 1-(y-y1)};
+    // Loop through the four cases
+    int iout = 0;
+    for(int iy = y1; iy < y1+2; iy++) {
+        if(iy < 0 || iy >= naxis[0]) continue;
+        for(int ix = x1; ix < x1+2; ix++) {
+            if(ix < 0 || ix >= naxis[1]) continue;
+            pixinds[iout][0] = iy;
+            pixinds[iout][1] = ix;
+            pixweights[iout] = wy[iy-y1]*wx[ix-x1];
+            iout++;
+        }
+    }
+    return iout;
+}
+
+template <typename Interpol>
+class Pixelizor2_Flat<Tiled, Interpol> {
 public:
     static const int index_count = 3;
+    static const int interp_count = Interpol::interp_count;
     Pixelizor2_Flat() {};
     Pixelizor2_Flat(int ny, int nx,
                     double dy, double dx,
                     double iy0, double ix0,
                     int tiley, int tilex) {
@@ -584,6 +673,8 @@ class Pixelizor2_Flat {
         pixel_index[1] = int(iy) - sub_y * tile_shape[0];
         pixel_index[2] = int(ix) - sub_x * tile_shape[1];
     }
+    inline
+    int GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]);
     bool TestInputs(bp::object &map, bool need_map, bool need_weight_map,
                     int comp_count) {
         vector<int> map_shape_req;
@@ -652,6 +743,51 @@ class Pixelizor2_Flat {
     vector<BufferWrapper<FSIGNAL>> mapbufs;
 };
 
+// Take care of the parts that depend on the interpolation order.
+// First NearestNeighbor interpolation
+template<>
+inline int Pixelizor2_Flat<Tiled, NearestNeighbor>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    int ix = int(coords[0] / parent_pix.cdelt[1] + parent_pix.crpix[1] - 1 + 0.5);
+    if (ix < 0 || ix >= parent_pix.naxis[1]) return 0;
+    int iy = int(coords[1] / parent_pix.cdelt[0] + parent_pix.crpix[0] - 1 + 0.5);
+    if (iy < 0 || iy >= parent_pix.naxis[0]) return 0;
+    // Get the tile and tile offsets
+    int sub_y = iy / tile_shape[0];
+    int sub_x = ix / tile_shape[1];
+    pixinds[0][0] = sub_y * ((parent_pix.naxis[1] + tile_shape[1] - 1) / tile_shape[1]) + sub_x;
+    pixinds[0][1] = iy - sub_y * tile_shape[0];
+    pixinds[0][2] = ix - sub_x * tile_shape[1];
+    pixweights[0] = 1;
+    return 1;
+}
+
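+// Worked example of the tile indexing above (illustrative numbers): with
+// tile_shape = {100,100} and a 350-column parent geometry (so 4 tiles
+// across), the global pixel (iy,ix) = (120,230) lands in tile
+// 1*4 + 2 = 6, at tile-local coordinates (20,30).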
+// Then Bilinear interpolation
+template<>
+inline int Pixelizor2_Flat<Tiled, Bilinear>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For bilinear mapmaking we need to visit the four bounding pixels
+    double x = coords[0] / parent_pix.cdelt[1] + parent_pix.crpix[1] - 1 + 0.5;
+    double y = coords[1] / parent_pix.cdelt[0] + parent_pix.crpix[0] - 1 + 0.5;
+    int x1 = int(x);
+    int y1 = int(y);
+    double wx[2] = {x-x1, 1-(x-x1)};
+    double wy[2] = {y-y1, 1-(y-y1)};
+    // Loop through the four cases
+    int iout = 0;
+    for(int iy = y1; iy < y1+2; iy++) {
+        if(iy < 0 || iy >= parent_pix.naxis[0]) continue;
+        int sub_y = iy / tile_shape[0];
+        for(int ix = x1; ix < x1+2; ix++) {
+            if(ix < 0 || ix >= parent_pix.naxis[1]) continue;
+            int sub_x = ix / tile_shape[1];
+            pixinds[iout][0] = sub_y * ((parent_pix.naxis[1] + tile_shape[1] - 1) / tile_shape[1]) + sub_x;
+            pixinds[iout][1] = iy - sub_y * tile_shape[0];
+            pixinds[iout][2] = ix - sub_x * tile_shape[1];
+            pixweights[iout] = wy[iy-y1]*wx[ix-x1];
+            iout++;
+        }
+    }
+    return iout;
+}
 
 template <typename CoordSys, typename SpinSys>
 static inline
@@ -860,6 +996,10 @@ bp::object ProjectionEngine::pixels(
     return pixel_buf_man.ret_val;
 }
 
+// NB: This function assumes nearest neighbor.  It doesn't look like
+// it can be generalized with its current interface, since it returns
+// an [ndet,ntime,{y,x,...}] array which can't handle multiple pixels
+// per sample.
 template <typename C, typename P, typename S>
 bp::object ProjectionEngine<C,P,S>::pointing_matrix(
     bp::object pbore, bp::object pofs, bp::object pixel, bp::object proj)
@@ -922,70 +1062,92 @@ bp::object ProjectionEngine::pixel_ranges(
     if (from_map)
         _pixelizor.TestInputs(map, true, false, S::comp_count);
 
-    vector<vector<RangesInt32>> ranges;
-
-#pragma omp parallel
-    {
-        if (n_domain <= 0) {
-            n_domain = 1;
-            #ifdef _OPENMP
-            n_domain = omp_get_num_threads();
-            #endif
-        }
-#pragma omp single
-        {
-            for (int i=0; i<n_domain; i++) {
-                vector<RangesInt32> v(n_det);
-                for (auto &_v: v)
-                    _v.count = n_time;
-                ranges.push_back(v);
-            }
-        }
+    // Default value of the number of domains to set up is one per available
+    // thread
+    if (n_domain <= 0) {
+        n_domain = 1;
+        #ifdef _OPENMP
+        n_domain = omp_get_max_threads();
+        #endif
+    }
 
-#pragma omp for
-        for (int i_det = 0; i_det < n_det; ++i_det) {
-            double dofs[4];
-            pointer.InitPerDet(i_det, dofs);
-            int last_slice = -1;
-            int slice_start = 0;
-            int pixel_offset[P::index_count] = {-1};
-            for (int i_time = 0; i_time < n_time; ++i_time) {
-                double coords[4];
-                pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-
-                int this_slice = -1;
-                if (from_map) {
-                    if (pixel_offset[0] >= 0)
-                        this_slice = *_pixelizor.pix(0, pixel_offset);
-                } else {
-                    this_slice = _pixelizor.stripe(pixel_offset, n_domain);
-                }
+    // ranges is [nbunch,ndomain,ndet][nrange]
+    // we loop in series over bunches, then in parallel over
+    // the domains in each bunch.  This could be set up in multiple
+    // ways.  Here we use 2 bunches.  The first has nthread domains,
+    // and takes care of the samples that unambiguously hit a domain.
+    // The second bunch contains just one pseudo-domain for all the samples
+    // that straddle a domain edge.  These will be done in series.
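+    // E.g. with n_domain = 4 the result unpacks on the Python side as
+    // (illustrative):
+    //   ranges[0] -> 4 per-thread RangesMatrix objects, pixel-disjoint,
+    //   ranges[1] -> 1 RangesMatrix holding the leftover boundary samples.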
+    vector<vector<vector<RangesInt32>>> ranges(2);
+    auto & par_ranges = ranges[0];
+    auto & ser_ranges = ranges[1];
+    auto empty_range = vector<RangesInt32>(n_det, RangesInt32(n_time));
+    for (int i=0; i<n_domain; i++)
+        par_ranges.push_back(empty_range);
+    ser_ranges.push_back(empty_range);
+
+#pragma omp parallel for
+    for (int i_det = 0; i_det < n_det; ++i_det) {
+        double dofs[4];
+        pointer.InitPerDet(i_det, dofs);
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
+        int last_domain = -1;
+        int domain_start = 0;
+        for (int i_time = 0; i_time < n_time; ++i_time) {
+            double coords[4];
+            pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
+            // Set samp_domain from the per-point domains if all points for
+            // this sample agree, otherwise set it to n_domain.
+            int samp_domain = -1;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point) {
+                int point_domain = from_map ? *_pixelizor.pix(0, pixinds[i_point])
+                    : _pixelizor.stripe(pixinds[i_point], n_domain);
+                if(i_point > 0 && point_domain != samp_domain) {
+                    // This is a mixed-domain sample.  Put it in the serial category.
+                    samp_domain = n_domain;
+                    break;
                 }
+                samp_domain = point_domain;
+            }
-            if (this_slice != last_slice) {
-                if (last_slice >= 0)
-                    ranges[last_slice][i_det].append_interval_no_check(
-                        slice_start, i_time);
-                slice_start = i_time;
-                last_slice = this_slice;
+            // Did the domain status change?
+            if (samp_domain != last_domain) {
+                // Avoid triggering on the first sample, before we have established any domain
+                if (last_domain >= 0) {
+                    // Assign to either a parallel domain or the serial misc category
+                    auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+                    target_ranges[i_det].append_interval_no_check(domain_start, i_time);
+                }
+                domain_start = i_time;
+                last_domain = samp_domain;
             }
-        }
-        if (last_slice >= 0)
-            ranges[last_slice][i_det].append_interval_no_check(
-                slice_start, n_time);
+        }
+        // Close the last interval
+        if (last_domain >= 0) {
+            auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+            target_ranges[i_det].append_interval_no_check(domain_start, n_time);
+        }
     }
 
     // Convert super vector to a list and return
-    auto ivals_out = bp::list();
-    for (int j=0; j<ranges.size(); j++) {
-        auto ivals = bp::list();
-        for (int i_det=0; i_det<n_det; i_det++)
-            ivals.append(ranges[j][i_det]);
-        ivals_out.append(bp::extract<bp::object>(ivals)());
-    }
-    return bp::extract<bp::object>(ivals_out);
+    auto ivals = bp::list();
+    for (int i_bunch=0; i_bunch<ranges.size(); i_bunch++) {
+        auto bunches = bp::list();
+        for (int i_domain=0; i_domain<ranges[i_bunch].size(); i_domain++) {
+            auto domains = bp::list();
+            for (int i_det=0; i_det<n_det; i_det++)
+                domains.append(ranges[i_bunch][i_domain][i_det]);
+            bunches.append(bp::extract<bp::object>(domains)());
+        }
+        ivals.append(bp::extract<bp::object>(bunches)());
+    }
+    return bp::extract<bp::object>(ivals);
 }
 
 template <typename C, typename P, typename S>
@@ -1022,7 +1184,8 @@ vector ProjectionEngine::tile_hits(
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
         int thread = 0;
         #ifdef _OPENMP
         thread = omp_get_thread_num();
@@ -1030,9 +1193,9 @@ vector ProjectionEngine::tile_hits(
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            if (pixel_offset[0] >= 0)
-                temp[thread][pixel_offset[0]]++;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point)
+                temp[thread][pixinds[i_point][0]]++;
         }
     }
 #pragma omp single
@@ -1064,6 +1227,7 @@ bp::object ProjectionEngine::tile_ranges(
     int n_tile = _pixelizor.tile_count();
     if (n_tile < 0)
         throw general_exception("No tiles in this pixelization.");
+    int n_domain = bp::len(tile_lists);
 
     // Make a vector that maps tile into thread.
     vector<int> thread_idx(n_tile, -1);
     for (int i=0; i<bp::len(tile_lists); i++) {
         for (int j=0; j<bp::len(tile_lists[i]); j++) {
             thread_idx[bp::extract<int>(tile_lists[i][j])] = i;
         }
     }
 
-    // Make a bunch of Ranges, index them with [thread, det].  Note
-    // thread isn't the OMP thread index, below, but the thread_index
-    // determined from tile_lists.  It's safe to populate these
-    // structures in parallel provided that only one thread touches
-    // each Ranges object, which is the case since each OMP thread
-    // below specializes in certain detectors.
-    vector<vector<RangesInt32>> ranges;
-    for (int i=0; i<bp::len(tile_lists); i++) {
-        vector<RangesInt32> v(n_det);
-        for (auto &_v: v)
-            _v.count = n_time;
-        ranges.push_back(v);
-    }
+    // ranges is [nbunch,ndomain,ndet][nrange]
+    // we loop in series over bunches, then in parallel over
+    // the domains in each bunch.  This could be set up in multiple
+    // ways.  Here we use 2 bunches.  The first has nthread domains,
+    // and takes care of the samples that unambiguously hit a domain.
+    // The second bunch contains just one pseudo-domain for all the samples
+    // that straddle a domain edge.  These will be done in series.
+    vector<vector<vector<RangesInt32>>> ranges(2);
+    auto & par_ranges = ranges[0];
+    auto & ser_ranges = ranges[1];
+    auto empty_range = vector<RangesInt32>(n_det, RangesInt32(n_time));
+    for (int i=0; i<n_domain; i++)
+        par_ranges.push_back(empty_range);
+    ser_ranges.push_back(empty_range);
 
 #pragma omp parallel for
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int last_slice = -1;
-        int slice_start = 0;
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
+        int last_domain = -1;
+        int domain_start = 0;
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            int this_slice = -1;
-            if (pixel_offset[0] >= 0)
-                this_slice = thread_idx[pixel_offset[0]];
-            if (this_slice != last_slice) {
-                if (last_slice >= 0)
-                    ranges[last_slice][i_det].append_interval_no_check(
-                        slice_start, i_time);
-                slice_start = i_time;
-                last_slice = this_slice;
+            // set samp_domain from thread_idx if all points for this sample agree, otherwise
+            // set it to n_domain
+            int samp_domain = -1;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point) {
+                int point_domain = thread_idx[pixinds[i_point][0]];
+                if(i_point > 0 && point_domain != samp_domain) {
+                    // This is a mixed-domain sample.  Put it in the serial category.
+                    samp_domain = n_domain;
+                    break;
+                }
+                samp_domain = point_domain;
             }
+            // Did the domain status change?
+            if (samp_domain != last_domain) {
+                // Avoid triggering on the first sample, before we have established any domain
+                if (last_domain >= 0) {
+                    // Assign to either a parallel domain or the serial misc category
+                    auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+                    target_ranges[i_det].append_interval_no_check(domain_start, i_time);
+                }
+                domain_start = i_time;
+                last_domain = samp_domain;
+            }
+        }
+        // Close the last interval
+        if (last_domain >= 0) {
+            auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+            target_ranges[i_det].append_interval_no_check(domain_start, n_time);
+        }
-        if (last_slice >= 0)
-            ranges[last_slice][i_det].append_interval_no_check(
-                slice_start, n_time);
     }
 
-    // Convert vector<vector<RangesInt32>> to a list and return it.
-    auto ivals_out = bp::list();
-    for (int j=0; j<ranges.size(); j++) {
-        auto ivals = bp::list();
-        for (int i_det=0; i_det<n_det; i_det++)
-            ivals.append(ranges[j][i_det]);
-        ivals_out.append(bp::extract<bp::object>(ivals)());
-    }
-    return bp::extract<bp::object>(ivals_out);
+    // Convert super vector to a list and return
+    auto ivals = bp::list();
+    for (int i_bunch=0; i_bunch<ranges.size(); i_bunch++) {
+        auto bunches = bp::list();
+        for (int i_domain=0; i_domain<ranges[i_bunch].size(); i_domain++) {
+            auto domains = bp::list();
+            for (int i_det=0; i_det<n_det; i_det++)
+                domains.append(ranges[i_bunch][i_domain][i_det]);
+            bunches.append(bp::extract<bp::object>(domains)());
+        }
+        ivals.append(bp::extract<bp::object>(bunches)());
+    }
+    return bp::extract<bp::object>(ivals);
 }
 
 template <typename C, typename P, typename S>
@@ -1174,30 +1358,29 @@ bp::object ProjectionEngine::from_map(
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
         FSIGNAL pf[S::comp_count];
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            if (pixel_offset[0] < 0) continue;
             spin_proj_factors(coords, pf);
-            FSIGNAL *sig = (_signalspace.data_ptr[i_det] +
-                            _signalspace.steps[0]*i_time);
-            for (int imap=0; imap<S::comp_count; ++imap)
-                *sig += *_pixelizor.pix(imap, pixel_offset) * pf[imap];
+            FSIGNAL *sig = (_signalspace.data_ptr[i_det] +
+                            _signalspace.steps[0]*i_time);
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point)
+                for (int i_map=0; i_map<S::comp_count; ++i_map)
+                    *sig += *_pixelizor.pix(i_map, pixinds[i_point]) * pf[i_map] * pixweights[i_point];
         }
     }
 
 template <typename C, typename P, typename S>
 static
 void to_map_single_thread(Pointer<C> &pointer,
                           P &_pixelizor,
-                          vector<RangesInt32> ivals,
+                          const vector<RangesInt32> & ivals,
                          BufferWrapper<FSIGNAL> &_det_weights,
                          SignalSpace<FSIGNAL> *_signalspace)
 {
@@ -1209,19 +1392,22 @@ void to_map_single_thread(Pointer &pointer,
                                _det_weights->strides[0]*i_det);
         double dofs[4];
         double coords[4];
-        int pixel_offset[P::index_count] = {-1};
         FSIGNAL pf[S::comp_count];
         pointer.InitPerDet(i_det, dofs);
+        // Pointing matrix interpolation stuff
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count] = {0};
         for (auto const &rng: ivals[i_det].segments) {
             for (int i_time = rng.first; i_time < rng.second; ++i_time) {
                 pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-                if (pixel_offset[0] < 0) continue;
-                const FSIGNAL sig = *(_signalspace->data_ptr[i_det] +
-                                      _signalspace->steps[0]*i_time);
                 spin_proj_factors(coords, pf);
-                for (int imap=0; imap<S::comp_count; ++imap)
-                    *_pixelizor.pix(imap, pixel_offset) += sig * pf[imap] * weight;
+                const FSIGNAL sig = _signalspace->data_ptr[i_det][_signalspace->steps[0]*i_time];
+                // In interpolated mapmaking, like bilinear mapmaking, each sample hits
+                // multiple pixels, each with its own weight.
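+                // Schematically (illustrative), the loop below performs
+                //   map[c, pixel_k] += weight * sig * w_k * pf[c]
+                // for each of the n_point (pixel_k, w_k) pairs returned
+                // by GetPixels.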
+                int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+                for(int i_point = 0; i_point < n_point; ++i_point)
+                    for (int i_map=0; i_map<S::comp_count; ++i_map)
+                        *_pixelizor.pix(i_map, pixinds[i_point]) +=
+                            sig * pf[i_map] * weight * pixweights[i_point];
             }
         }
     }
 }
 
 template <typename C, typename P, typename S>
 static
 void to_weight_map_single_thread(Pointer<C> &pointer,
                                  P &_pixelizor,
-                                 vector<RangesInt32> ivals,
+                                 const vector<RangesInt32> & ivals,
                                  BufferWrapper<FSIGNAL> &_det_weights)
 {
         const FSIGNAL weight = *(FSIGNAL*)((char*)_det_weights->buf +
                                _det_weights->strides[0]*i_det);
         double dofs[4];
         double coords[4];
-        int pixel_offset[P::index_count] = {-1};
         FSIGNAL pf[S::comp_count];
         pointer.InitPerDet(i_det, dofs);
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count] = {0};
         for (auto const &rng: ivals[i_det].segments) {
             for (int i_time = rng.first; i_time < rng.second; ++i_time) {
                 pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-                if (pixel_offset[0] < 0) continue;
                 spin_proj_factors(coords, pf);
-                for (int imap=0; imap<S::comp_count; ++imap)
-                    for (int jmap=imap; jmap<S::comp_count; ++jmap)
-                        *_pixelizor.pix(imap, jmap, pixel_offset) +=
-                            pf[imap] * pf[jmap] * weight;
+                int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+                for(int i_point = 0; i_point < n_point; ++i_point)
+                    for (int i_map=0; i_map<S::comp_count; ++i_map)
+                        for (int j_map=i_map; j_map<S::comp_count; ++j_map)
+                            *_pixelizor.pix(i_map, j_map, pixinds[i_point]) +=
+                                pf[i_map] * pf[j_map] * weight *
+                                pixweights[i_point] * pixweights[i_point];
             }
         }
     }
 }
 
 static
-vector<vector<RangesInt32>> derive_ranges(
+vector<vector<vector<RangesInt32>>> derive_ranges(
     bp::object thread_intervals, int n_det, int n_time,
     std::string arg_name)
 {
     // - List of Ranges: This will be promoted to a single thread.
     // - List containing N lists of Ranges: N threads, with each list
     //   used by corresponding thread.
+    // - List containing M lists of N[M] lists of Ranges: M bunches
+    //   which will be looped over in serial, each containing N[M]
+    //   thread-lists that will be done in parallel.
+    vector<vector<vector<RangesInt32>>> ivals;
 
-    vector<vector<RangesInt32>> ivals;
     if (isNone(thread_intervals)) {
+        // It's None.  Generate a single bunch with a single thread covering all samples.
         auto r = RangesInt32(n_time).add_interval(0, n_time);
-        vector<RangesInt32> v(n_det, r);
+        vector<vector<RangesInt32>> v(1, vector<RangesInt32>(n_det, r));
         ivals.push_back(v);
+    } else if(bp::extract<RangesInt32>(thread_intervals[0]).check()) {
+        // It's a RangesMatrix (ndet,nranges).  Promote to single thread, single bunch.
+        ivals.push_back(vector<vector<RangesInt32>>(1, extract_ranges<int32_t>(thread_intervals)));
+    } else if(bp::extract<RangesInt32>(thread_intervals[0][0]).check()) {
+        // It's a per-thread RangesMatrix (nthread,ndet,nranges).  Promote to single bunch.
+        vector<vector<RangesInt32>> bunch;
+        for (int i=0; i<bp::len(thread_intervals); i++)
+            bunch.push_back(extract_ranges<int32_t>(thread_intervals[i]));
+        ivals.push_back(bunch);
+    } else if(bp::extract<RangesInt32>(thread_intervals[0][0][0]).check()) {
+        // It's a full multi-bunch (nbunch,nthread,ndet,nranges) thing.
+        for (int i=0; i<bp::len(thread_intervals); i++) {
+            vector<vector<RangesInt32>> bunch;
+            for (int j=0; j<bp::len(thread_intervals[i]); j++)
+                bunch.push_back(extract_ranges<int32_t>(thread_intervals[i][j]));
+            ivals.push_back(bunch);
+        }
     } else {
-        // Don't convert to a list... just use indexing.
-        if (bp::extract<RangesInt32>(thread_intervals[0]).check()) {
-            ivals.push_back(extract_ranges<int32_t>(thread_intervals));
-        } else {
-            // Assumed it is list of lists of ranges.
-            for (int i=0; i<bp::len(thread_intervals); i++)
-                ivals.push_back(extract_ranges<int32_t>(thread_intervals[i]));
+        // This should not happen
+        assert(false);
+    }
+    // Check that these all have the right shape.  Maybe consider a standard
+    // for loop instead of foreach to give more useful error messages using
+    // the index.
+    for(auto const & bunch: ivals)
+        for(auto const & ival: bunch) {
+            if (ival.size() != n_det) {
+                std::ostringstream err;
+                err << "Expected RangesMatrix with n_det=" << n_det
+                    << " but encountered n_det=" << ival.size();
+                throw shape_exception(arg_name, err.str());
             }
-        // Check that these all have the right shape.
-        for (auto const &ival: ivals) {
-            if (ival.size() != n_det) {
-                std::ostringstream err;
-                err << "Expected RangesMatrix with n_det=" << n_det
-                    << " but encountered n_det=" << ival.size();
-                throw shape_exception(arg_name, err.str());
-            }
-            for (auto const &ri: ival) {
-                if (ri.count != n_time) {
-                    std::ostringstream err;
-                    err << "Expected RangesMatrix with n_time=" << n_time
-                        << " but encountered n_time=" << ri.count;
-                    throw shape_exception(arg_name, err.str());
-                }
-            }
-        }
+            for (auto const &ri: ival) {
+                if (ri.count != n_time) {
+                    std::ostringstream err;
+                    err << "Expected RangesMatrix with n_time=" << n_time
+                        << " but encountered n_time=" << ri.count;
+                    throw shape_exception(arg_name, err.str());
+                }
+            }
+        }
     return ivals;
 }
@@ -1333,35 +1544,20 @@ bp::object ProjectionEngine::to_map(
     auto _det_weights = BufferWrapper<FSIGNAL>(
         "det_weights", det_weights, true, vector<int>{n_det});
 
-    // For multi-threading, the principle here is that all threads
-    // loop over all detectors, but the sample ranges encoded in ivals
-    // are disjoint.
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
+    // For multi-threading, the principle here is that we loop serially
+    // over bunches, and then inside each bunch all threads loop over
+    // all detectors in parallel, but the sample ranges are pixel-disjoint.
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time,
                                "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
-            to_map_single_thread<C,P,S>(
-                pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    to_map_single_thread<C,P,S>(
-                        pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
-            }
-        }
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
+            to_map_single_thread<C,P,S>(pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
     }
     return map;
 }
@@ -1390,46 +1586,33 @@ bp::object ProjectionEngine::to_weight_map(
     auto _det_weights = BufferWrapper<FSIGNAL>(
         "det_weights", det_weights, true, vector<int>{n_det});
 
-    // For multi-threading, the principle here is that all threads
-    // loop over all detectors, but the sample ranges encoded in ivals
-    // are disjoint.
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
+    // For multi-threading, the principle here is that we loop serially
+    // over bunches, and then inside each bunch all threads loop over
+    // all detectors in parallel, but the sample ranges are pixel-disjoint.
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time,
                                "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
-            to_weight_map_single_thread<C,P,S>(
-                pointer, _pixelizor, ivals[i_thread], _det_weights);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    to_weight_map_single_thread<C,P,S>(
-                        pointer, _pixelizor, ivals[i_thread], _det_weights);
-            }
-        }
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
+            to_weight_map_single_thread<C,P,S>(pointer, _pixelizor, ivals[i_thread], _det_weights);
     }
     return map;
 }
 
-
 // ProjEng_Precomp
 //
 // The ProjEng_Precomp may be used for very fast projection
 // operations, provided you have precomputed pixel index and spin
 // projection factors for every sample.  This is agnostic of
-// coordinate system, and so not crazily templated.
+// coordinate system, and so not crazily templated.  It is
+// hard-coded to nearest neighbor interpolation, though one could
+// emulate interpolation by calling it repeatedly with different
+// pixel indices and weights.
 //
 
 template <typename TilingSys>
@@ -1599,34 +1782,18 @@ bp::object ProjEng_Precomp::to_map(
         throw shape_exception("pixel_index",
             "Fast dimension of pixel indices must be close-packed.");
 
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
-                               "thread_intervals");
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
             precomp_to_map_single_thread(
                 pixelizor, pixel_buf_man, spin_proj_man,
-                ivals[i_thread], _det_weights, &_signalspace);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    precomp_to_map_single_thread(
-                        pixelizor, pixel_buf_man, spin_proj_man,
-                        ivals[i_thread], _det_weights, &_signalspace);
-            }
-        }
+                ivals[i_thread], _det_weights, &_signalspace);
     }
     return map;
 }
@@ -1662,34 +1829,18 @@ bp::object ProjEng_Precomp::to_weight_map(
         throw shape_exception("pixel_index",
             "Fast dimension of pixel indices must be close-packed.");
 
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
-                               "thread_intervals");
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
            precomp_to_weight_map_single_thread(
                 pixelizor, pixel_buf_man, spin_proj_man,
-                ivals[i_thread], _det_weights);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    precomp_to_weight_map_single_thread(
-                        pixelizor, pixel_buf_man, spin_proj_man,
-                        ivals[i_thread], _det_weights);
-            }
-        }
+                ivals[i_thread], _det_weights);
     }
     return map;
 }
@@ -1698,10 +1849,13 @@
 typedef ProjEng_Precomp<NonTiled> ProjEng_Precomp_NonTiled;
 typedef ProjEng_Precomp<Tiled> ProjEng_Precomp_Tiled;
 
 #define PROJENG(PIX, SPIN, TILING) ProjEng_##PIX##_##SPIN##_##TILING
+#define PROJENG_INTERP(PIX, SPIN, TILING, INTERP) ProjEng_##PIX##_##SPIN##_##TILING##_##INTERP
 
-#define TYPEDEF_TILING(PIX, SPIN, TILING) \
+#define TYPEDEF_TILING(PIX, SPIN, TILING) \
   typedef ProjectionEngine<Proj##PIX,Pixelizor2_Flat<TILING>,Spin##SPIN> \
-    PROJENG(PIX, SPIN, TILING);
+    PROJENG(PIX, SPIN, TILING); \
+  typedef ProjectionEngine<Proj##PIX,Pixelizor2_Flat<TILING,Bilinear>,Spin##SPIN> \
+    PROJENG_INTERP(PIX, SPIN, TILING, Bilinear);
 
 #define TYPEDEF_SPIN(PIX, SPIN) \
     TYPEDEF_TILING(PIX, SPIN, Tiled) \
@@ -1740,7 +1894,8 @@ TYPEDEF_PIX(ZEA)
 
 #define EXPORT_TILING(PIX, SPIN, TILING) \
-    EXPORT_ENGINE(PROJENG(PIX, SPIN, TILING))
+    EXPORT_ENGINE(PROJENG(PIX, SPIN, TILING)) \
+    EXPORT_ENGINE(PROJENG_INTERP(PIX, SPIN, TILING, Bilinear))
 
 #define EXPORT_SPIN(PIX, SPIN) \
     EXPORT_TILING(PIX, SPIN, Tiled) \
diff --git a/test/test_proj_eng.py b/test/test_proj_eng.py
index 18c255bc..95652d5a 100644
--- a/test/test_proj_eng.py
+++ b/test/test_proj_eng.py
@@ -103,13 +103,14 @@ def test_20_threads(self):
                 continue
             else:
                 threads = p.assign_threads(asm, method=method, n_threads=n_threads)
-            self.assertEqual(threads.shape, (n_threads,) + sig.shape,
+            # This may need to be generalized if we implement fancier threads schemes.
+            self.assertEqual(threads[0].shape, (n_threads,) + sig.shape,
                              msg=f'threads has wrong shape ({detail})')
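+            # For reference (illustrative): threads[0] holds the n_threads
+            # pixel-disjoint parallel domains, while threads[1] has shape
+            # (1,) + sig.shape and holds the serial leftover samples.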
             # Make sure the threads cover the TOD, or not,
-            # depending on clipped.
-            counts = np.zeros(threads.shape[1:], int)
-            for t in threads:
+            # depending on clipped.  This may also need generalization.
+            counts = np.zeros(threads[0].shape[1:], int)
+            for t in threads[0]:
                 counts += t.mask()
             target = set([0,1]) if clipped else set([1])
             self.assertEqual(set(counts.ravel()), target,