diff --git a/python/proj/wcs.py b/python/proj/wcs.py
index ee1d3b59..e9023b8b 100644
--- a/python/proj/wcs.py
+++ b/python/proj/wcs.py
@@ -90,6 +90,13 @@ class Projectionist:
         operations.  Such objects should satisfy the condition that
         threads[x,j]*threads[y,j] is the empty Range for x != y;
         i.e. each detector-sample is assigned to at most one thread.
+    * ``interpol``: How positions that fall between pixel centers
+      are handled.  Options are "nearest" (default): use Nearest
+      Neighbor interpolation, so a sample takes the value of
+      whatever pixel is closest; or "bilinear": linearly
+      interpolate between the four closest pixels.  Bilinear is
+      around 60% slower, but avoids problems caused by a
+      discontinuous model.
 
     Attributes:
       naxis: 2-element integer array specifying the map shape (for
@@ -146,13 +153,12 @@ def tiling(self):
         return _Tiling(self.naxis[::-1], self.tile_shape)
 
     @classmethod
-    def for_geom(cls, shape, wcs):
+    def for_geom(cls, shape, wcs, interpol=None):
         """Construct a Projectionist for use with the specified "geometry".
 
         The shape and wcs are the core information required to prepare
         a Projectionist, so this method is likely to be called by
         other constructors.
-
         """
         self = cls()
         ax1, ax2 = wcs.wcs.lng, wcs.wcs.lat
@@ -177,10 +183,14 @@ def for_geom(cls, shape, wcs):
         self.cdelt = np.array(wcs.wcs.cdelt) * quat.DEG
         self.crpix = np.array(wcs.wcs.crpix)
 
+        # Pixel interpolation mode
+        if interpol is None: interpol = "nearest"
+        self.interpol = interpol
+
         return self
 
     @classmethod
-    def for_map(cls, emap, wcs=None):
+    def for_map(cls, emap, wcs=None, interpol=None):
         """Construct a Projectionist for use with maps having the same
         geometry as the provided enmap.
 
@@ -190,7 +200,6 @@ def for_map(cls, emap, wcs=None):
             with shape attribute), provided that wcs is provided
             separately.
           wcs: optional WCS object to use instead of emap.wcs.
-
         """
         if wcs is None:
             wcs = emap.wcs
@@ -214,7 +223,7 @@ def for_source_at(cls, alpha0, delta0, gamma0=0.,
         return self
 
     @classmethod
-    def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True):
+    def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True, interpol=None):
         """Construct a Projectionist for use with the specified geometry
         (shape, wcs), cut into tiles of shape tile_shape.
 
@@ -232,7 +241,7 @@ def for_tiled(cls, shape, wcs, tile_shape, active_tiles=True):
         populate on such calls.
 
         """
-        self = cls.for_geom(shape, wcs)
+        self = cls.for_geom(shape, wcs, interpol=interpol)
        self.tile_shape = np.array(tile_shape, 'int')
         if active_tiles is True:
             self.active_tiles = list(range(self.tiling.tile_count))
@@ -260,15 +269,19 @@ def _get_pixelizor_args(self):
         return args
 
     def get_ProjEng(self, comps='TQU', proj_name=None, get=True,
-                    instance=True):
+                    instance=True, interpol=None):
         """Returns an so3g.ProjEng object appropriate for use with the
         configured geometry.
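+
+        As a usage sketch (illustrative; assumes a CAR geometry, so that
+        proj_name resolves to 'CAR'), the interpolation mode just selects
+        a suffix on the engine class name::
+
+            p.get_ProjEng('TQU', get=False)
+            # -> 'ProjEng_CAR_TQU_NonTiled'
+            p.get_ProjEng('TQU', get=False, interpol='bilinear')
+            # -> 'ProjEng_CAR_TQU_NonTiled_Bilinear'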
""" - if proj_name is None: - proj_name = self.proj_name + if proj_name is None: proj_name = self.proj_name tile_suffix = 'Tiled' if self.tiling else 'NonTiled' - projeng_name = f'ProjEng_{proj_name}_{comps}_{tile_suffix}' + # Interpolation mode + if interpol is None: interpol = self.interpol + if interpol in ["nn", "nearest"]: interpol_suffix = "" + elif interpol in ["lin", "bilinear"]: interpol_suffix = "_Bilinear" + else: raise ValueError("ProjEng interpol '%s' not recognized" % str(interpol)) + projeng_name = f'ProjEng_{proj_name}_{comps}_{tile_suffix}{interpol_suffix}' if not get: return projeng_name try: @@ -499,7 +512,7 @@ def assign_threads(self, assembly, method='domdir', n_threads=None): projeng = self.get_ProjEng('T') q_native = self._cache_q_fp_to_native(assembly.Q) omp_ivals = projeng.pixel_ranges(q_native, assembly.dets, None, n_threads) - return RangesMatrix([RangesMatrix(x) for x in omp_ivals]) + return wrap_ivals(omp_ivals) elif method == 'domdir': offs_rep = assembly.dets[::100] @@ -539,7 +552,7 @@ def assign_threads_from_map(self, assembly, tmap, n_threads=None): q_native = self._cache_q_fp_to_native(assembly.Q) n_threads = mapthreads.get_num_threads(n_threads) omp_ivals = projeng.pixel_ranges(q_native, assembly.dets, tmap, n_threads) - return RangesMatrix([RangesMatrix(x) for x in omp_ivals]) + return wrap_ivals(omp_ivals) def get_active_tiles(self, assembly, assign=False): """For a tiled Projection, figure out what tiles are hit by an @@ -592,7 +605,7 @@ def get_active_tiles(self, assembly, assign=False): group_tiles[idx].append(_t) # Now paint them into Ranges. R = projeng.tile_ranges(q_native, assembly.dets, group_tiles) - R = RangesMatrix([RangesMatrix(r) for r in R]) + R = wrap_ivals(R) return { 'active_tiles': list(tiles), 'hit_counts': list(hits), @@ -604,6 +617,8 @@ def get_active_tiles(self, assembly, assign=False): 'hit_counts': list(hits), } + _ivals_format = 2 + class _Tiling: """Utility class for computations involving tiled maps. @@ -629,6 +644,8 @@ def tile_offset(self, tile): row, col = self.tile_rowcol(tile) return row * self.tile_shape[0], col * self.tile_shape[1] +def wrap_ivals(ivals): + return RangesMatrix([RangesMatrix([RangesMatrix(y) for y in x]) for x in ivals]) THREAD_ASSIGNMENT_METHODS = [ 'simple', diff --git a/src/Projection.cxx b/src/Projection.cxx index 556aad1b..f83b771b 100644 --- a/src/Projection.cxx +++ b/src/Projection.cxx @@ -84,6 +84,42 @@ inline bool isNone(const bp::object &pyo) // quite easily; the most obvious candidate being spin-1, to carry // information such as in-plane pointing displacements. +// Bilinear mapmaking (and interpolated mapmaking in general) +// +// Originally only nearest-neighbor mapmaking was supported. Here, +// a sample takes the value of the nearest pixel, regardless of where +// inside this pixel it hits. This means each sample only needs to +// care about a single pixel, which is computed by Pixelizor's GetPixel. +// This simple relationship makes it elatively easy to avoid +// clobbering when using threads in the tod2map operation, as one could +// assign responsibility for different areas of pixels to different threads, +// and translate this directly into sample ranges for those threads. +// +// In interpolated mapmaking like bilinear mapmaking, the value of a sample +// is given by some linear combination of several of the nearest pixels. 
+// For bilinear mapmaking, these are the four pixels above/below and
+// left/right of the sample, each weighted by how close the pixel is to
+// the sample.  The pixels and corresponding weights are calculated using
+// Pixelizor's GetPixels function, which in turn uses the Pixelizor's Interpol
+// template argument to determine which type of interpolation should be used
+// (including the "no interpolation" nearest neighbor case).
+//
+// However, since each sample now hits multiple pixels, it is more difficult
+// to build a set of sample ranges that touch disjoint sets of pixels and so
+// can be iterated over in parallel without locking in the tod2map operation.
+// There will always be some samples that hit the edge of each thread's pixel
+// group, and so affect multiple threads' pixels.  We handle this by
+// generalizing the sample ranges from ranges = vector<vector<RangesInt32>>,
+// which is [nthread][ndet][nrange] in shape, to
+// ranges = vector<vector<vector<RangesInt32>>>,
+// which is [nbunch][nthread][ndet][nrange], and where we iterate over the
+// outermost "bunch" level in series, and then in parallel over the next level.
+// This general framework should work for all parallelization schemes, but what
+// we've implemented so far is a simple case where all samples that unambiguously
+// fall inside a single thread's pixel domain are handled as before, and the few
+// samples that fall in multiple domains are handled by a single thread at the
+// end.  This means that ranges[0] has shape [nthread][ndet][nrange] while
+// ranges[1] has shape [1][ndet][nrange].  More complicated schemes could be
+// used, but this seems to work well enough.
 
 // State the template options
 class ProjQuat;
@@ -98,6 +134,10 @@ class ProjZEA;
 class Tiled;
 class NonTiled;
 
+// State the pointing matrix interpolation options
+struct NearestNeighbor { static const int interp_count = 1; };
+struct Bilinear { static const int interp_count = 4; };
+
 // Helper template for SpinSys...
 template <typename SpinSys>
 class SpinClass {
@@ -383,13 +423,14 @@ void Pointer::GetCoords(int i_det, int i_time,
 //! Pixelizors
 //
 
-template <typename TilingSys>
+template <typename TilingSys, typename Interpol = NearestNeighbor>
 class Pixelizor2_Flat;
 
-template <>
-class Pixelizor2_Flat<NonTiled> {
+template <typename Interpol>
+class Pixelizor2_Flat<NonTiled, Interpol> {
 public:
     static const int index_count = 2;
+    static const int interp_count = Interpol::interp_count;
     Pixelizor2_Flat(int ny, int nx,
                     double dy=1., double dx=1.,
                     double iy0=0., double ix0=0.) {
@@ -444,6 +485,8 @@ class Pixelizor2_Flat {
         pixel_index[0] = int(iy);
         pixel_index[1] = int(ix);
     }
+    inline
+    int GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]);
     bool TestInputs(bp::object &map, bool need_map, bool need_weight_map,
                     int comp_count) {
         if (need_map) {
@@ -487,10 +530,56 @@ class Pixelizor2_Flat {
     BufferWrapper<FSIGNAL> mapbuf;
 };
 
-template <>
-class Pixelizor2_Flat<Tiled> {
+// Take care of the parts that depend on the interpolation order.
+// First NearestNeighbor interpolation
+
+template<>
+inline int Pixelizor2_Flat<NonTiled, NearestNeighbor>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For nearest neighbor this basically reduces to GetPixel
+    double x = coords[0] / cdelt[1] + crpix[1] - 1 + 0.5;
+    double y = coords[1] / cdelt[0] + crpix[0] - 1 + 0.5;
+    if (x < 0 || x >= naxis[1]) return 0;
+    if (y < 0 || y >= naxis[0]) return 0;
+    // SKN: On my laptop int(y)-int(y<0) takes 0.63 ns vs. 0.84 ns for int(floor(y)).
+    // Both are very fast, so maybe it's better to go for the latter, since it's clearer.
+    // For comparison the plain cast was 0.45 ns.
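+    // E.g. int(-0.3) - int(-0.3 < 0) = 0 - 1 = -1, matching floor(-0.3),
+    // whereas the plain cast alone would round toward zero and give 0.
+    // (Illustrative; negative positions are culled by the bounds check above.)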
+    pixinds[0][0] = int(y)-int(y<0);
+    pixinds[0][1] = int(x)-int(x<0);
+    pixweights[0] = 1;
+    return 1;
+}
+
+// Then Bilinear interpolation
+
+template<>
+inline int Pixelizor2_Flat<NonTiled, Bilinear>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For bilinear mapmaking we need to visit the four bounding pixels
+    double x = coords[0] / cdelt[1] + crpix[1] - 1 + 0.5;
+    double y = coords[1] / cdelt[0] + crpix[0] - 1 + 0.5;
+    int x1 = int(x)-int(x<0);
+    int y1 = int(y)-int(y<0);
+    double wx[2] = {x-x1, 1-(x-x1)};
+    double wy[2] = {y-y1, 1-(y-y1)};
+    // Loop through the four cases
+    int iout = 0;
+    for(int iy = y1; iy < y1+2; iy++) {
+        if(iy < 0 || iy >= naxis[0]) continue;
+        for(int ix = x1; ix < x1+2; ix++) {
+            if(ix < 0 || ix >= naxis[1]) continue;
+            pixinds[iout][0] = iy;
+            pixinds[iout][1] = ix;
+            pixweights[iout] = wy[iy-y1]*wx[ix-x1];
+            iout++;
+        }
+    }
+    return iout;
+}
+
+template <typename Interpol>
+class Pixelizor2_Flat<Tiled, Interpol> {
 public:
     static const int index_count = 3;
+    static const int interp_count = Interpol::interp_count;
     Pixelizor2_Flat() {};
     Pixelizor2_Flat(int ny, int nx,
                     double dy, double dx,
                     double iy0, double ix0,
                     int tiley, int tilex) {
@@ -584,6 +673,8 @@ class Pixelizor2_Flat {
         pixel_index[1] = int(iy) - sub_y * tile_shape[0];
         pixel_index[2] = int(ix) - sub_x * tile_shape[1];
     }
+    inline
+    int GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]);
     bool TestInputs(bp::object &map, bool need_map, bool need_weight_map,
                     int comp_count) {
         vector<int> map_shape_req;
@@ -652,6 +743,51 @@ class Pixelizor2_Flat {
     vector<BufferWrapper<FSIGNAL>> mapbufs;
 };
 
+// Take care of the parts that depend on the interpolation order.
+// First NearestNeighbor interpolation
+template<>
+inline int Pixelizor2_Flat<Tiled, NearestNeighbor>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    int ix = int(coords[0] / parent_pix.cdelt[1] + parent_pix.crpix[1] - 1 + 0.5);
+    if (ix < 0 || ix >= parent_pix.naxis[1]) return 0;
+    int iy = int(coords[1] / parent_pix.cdelt[0] + parent_pix.crpix[0] - 1 + 0.5);
+    if (iy < 0 || iy >= parent_pix.naxis[0]) return 0;
+    // Get the tile and tile offsets
+    int sub_y = iy / tile_shape[0];
+    int sub_x = ix / tile_shape[1];
+    pixinds[0][0] = sub_y * ((parent_pix.naxis[1] + tile_shape[1] - 1) / tile_shape[1]) + sub_x;
+    pixinds[0][1] = iy - sub_y * tile_shape[0];
+    pixinds[0][2] = ix - sub_x * tile_shape[1];
+    pixweights[0] = 1;
+    return 1;
+}
+
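+// Worked example of the tile indexing above (illustrative numbers): with
+// tile_shape = {100,100} and a 350-column parent geometry (so 4 tiles
+// across), the global pixel (iy,ix) = (120,230) lands in tile
+// 1*4 + 2 = 6, at tile-local coordinates (20,30).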
+// Then Bilinear interpolation
+template<>
+inline int Pixelizor2_Flat<Tiled, Bilinear>::GetPixels(int i_det, int i_time, const double *coords, int pixinds[interp_count][index_count], FSIGNAL pixweights[interp_count]) {
+    // For bilinear mapmaking we need to visit the four bounding pixels
+    double x = coords[0] / parent_pix.cdelt[1] + parent_pix.crpix[1] - 1 + 0.5;
+    double y = coords[1] / parent_pix.cdelt[0] + parent_pix.crpix[0] - 1 + 0.5;
+    int x1 = int(x);
+    int y1 = int(y);
+    double wx[2] = {x-x1, 1-(x-x1)};
+    double wy[2] = {y-y1, 1-(y-y1)};
+    // Loop through the four cases
+    int iout = 0;
+    for(int iy = y1; iy < y1+2; iy++) {
+        if(iy < 0 || iy >= parent_pix.naxis[0]) continue;
+        int sub_y = iy / tile_shape[0];
+        for(int ix = x1; ix < x1+2; ix++) {
+            if(ix < 0 || ix >= parent_pix.naxis[1]) continue;
+            int sub_x = ix / tile_shape[1];
+            pixinds[iout][0] = sub_y * ((parent_pix.naxis[1] + tile_shape[1] - 1) / tile_shape[1]) + sub_x;
+            pixinds[iout][1] = iy - sub_y * tile_shape[0];
+            pixinds[iout][2] = ix - sub_x * tile_shape[1];
+            pixweights[iout] = wy[iy-y1]*wx[ix-x1];
+            iout++;
+        }
+    }
+    return iout;
+}
 
 template <typename CoordSys, typename SpinSys>
 static inline
@@ -860,6 +996,10 @@ bp::object ProjectionEngine::pixels(
     return pixel_buf_man.ret_val;
 }
 
+// NB: This function assumes nearest neighbor.  It doesn't look like
+// it can be generalized with its current interface, since it returns
+// an [ndet,ntime,{y,x,...}] array which can't handle multiple pixels
+// per sample.
 template <typename C, typename P, typename S>
 bp::object ProjectionEngine<C,P,S>::pointing_matrix(
     bp::object pbore, bp::object pofs, bp::object pixel, bp::object proj)
@@ -922,70 +1062,92 @@ bp::object ProjectionEngine::pixel_ranges(
     if (from_map)
         _pixelizor.TestInputs(map, true, false, S::comp_count);
 
-    vector<vector<RangesInt32>> ranges;
-
-#pragma omp parallel
-    {
-        if (n_domain <= 0) {
-            n_domain = 1;
-            #ifdef _OPENMP
-            n_domain = omp_get_num_threads();
-            #endif
-        }
-#pragma omp single
-        {
-            for (int i=0; i<n_domain; i++) {
-                vector<RangesInt32> v(n_det);
-                for (auto &_v: v)
-                    _v.count = n_time;
-                ranges.push_back(v);
-            }
-        }
+    // Default value of the number of domains to set up is one per available
+    // thread
+    if (n_domain <= 0) {
+        n_domain = 1;
+        #ifdef _OPENMP
+        n_domain = omp_get_max_threads();
+        #endif
+    }
 
-#pragma omp for
-        for (int i_det = 0; i_det < n_det; ++i_det) {
-            double dofs[4];
-            pointer.InitPerDet(i_det, dofs);
-            int last_slice = -1;
-            int slice_start = 0;
-            int pixel_offset[P::index_count] = {-1};
-            for (int i_time = 0; i_time < n_time; ++i_time) {
-                double coords[4];
-                pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-
-                int this_slice = -1;
-                if (from_map) {
-                    if (pixel_offset[0] >= 0)
-                        this_slice = *_pixelizor.pix(0, pixel_offset);
-                } else {
-                    this_slice = _pixelizor.stripe(pixel_offset, n_domain);
-                }
+    // ranges is [nbunch,ndomain,ndet][nrange]
+    // we loop in series over bunches, then in parallel over
+    // the domains in each bunch.  This could be set up in multiple
+    // ways.  Here we use 2 bunches.  The first has nthread domains,
+    // and takes care of the samples that unambiguously hit a domain.
+    // The second bunch contains just one pseudo-domain for all the samples
+    // that straddle a domain edge.  These will be done in series.
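+    // E.g. with n_domain = 4 the result unpacks on the Python side as
+    // (illustrative):
+    //   ranges[0] -> 4 per-thread RangesMatrix objects, pixel-disjoint,
+    //   ranges[1] -> 1 RangesMatrix holding the leftover boundary samples.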
+    vector<vector<vector<RangesInt32>>> ranges(2);
+    auto & par_ranges = ranges[0];
+    auto & ser_ranges = ranges[1];
+    auto empty_range = vector<RangesInt32>(n_det, RangesInt32(n_time));
+    for (int i=0; i<n_domain; i++)
+        par_ranges.push_back(empty_range);
+    ser_ranges.push_back(empty_range);
+
+#pragma omp parallel for
+    for (int i_det = 0; i_det < n_det; ++i_det) {
+        double dofs[4];
+        pointer.InitPerDet(i_det, dofs);
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
+        int last_domain = -1;
+        int domain_start = 0;
+        for (int i_time = 0; i_time < n_time; ++i_time) {
+            double coords[4];
+            pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
+            // Set samp_domain from the per-point domains if all points for
+            // this sample agree, otherwise set it to n_domain.
+            int samp_domain = -1;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point) {
+                int point_domain = from_map ? *_pixelizor.pix(0, pixinds[i_point])
+                    : _pixelizor.stripe(pixinds[i_point], n_domain);
+                if(i_point > 0 && point_domain != samp_domain) {
+                    // This is a mixed-domain sample.  Put it in the serial category.
+                    samp_domain = n_domain;
+                    break;
                 }
+                samp_domain = point_domain;
+            }
-            if (this_slice != last_slice) {
-                if (last_slice >= 0)
-                    ranges[last_slice][i_det].append_interval_no_check(
-                        slice_start, i_time);
-                slice_start = i_time;
-                last_slice = this_slice;
+            // Did the domain status change?
+            if (samp_domain != last_domain) {
+                // Avoid triggering on the first sample, before we have established any domain
+                if (last_domain >= 0) {
+                    // Assign to either a parallel domain or the serial misc category
+                    auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+                    target_ranges[i_det].append_interval_no_check(domain_start, i_time);
+                }
+                domain_start = i_time;
+                last_domain = samp_domain;
             }
-        }
-        if (last_slice >= 0)
-            ranges[last_slice][i_det].append_interval_no_check(
-                slice_start, n_time);
+        }
+        // Close the last interval
+        if (last_domain >= 0) {
+            auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+            target_ranges[i_det].append_interval_no_check(domain_start, n_time);
+        }
     }
 
     // Convert super vector to a list and return
-    auto ivals_out = bp::list();
-    for (int j=0; j<ranges.size(); j++) {
-        auto ivals = bp::list();
-        for (int i_det=0; i_det<n_det; i_det++)
-            ivals.append(ranges[j][i_det]);
-        ivals_out.append(bp::extract<bp::object>(ivals)());
-    }
-    return bp::extract<bp::object>(ivals_out);
+    auto ivals = bp::list();
+    for (int i_bunch=0; i_bunch<ranges.size(); i_bunch++) {
+        auto bunches = bp::list();
+        for (int i_domain=0; i_domain<ranges[i_bunch].size(); i_domain++) {
+            auto domains = bp::list();
+            for (int i_det=0; i_det<n_det; i_det++)
+                domains.append(ranges[i_bunch][i_domain][i_det]);
+            bunches.append(bp::extract<bp::object>(domains)());
+        }
+        ivals.append(bp::extract<bp::object>(bunches)());
+    }
+    return bp::extract<bp::object>(ivals);
 }
 
 template <typename C, typename P, typename S>
@@ -1022,7 +1184,8 @@ vector ProjectionEngine::tile_hits(
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
         int thread = 0;
         #ifdef _OPENMP
         thread = omp_get_thread_num();
@@ -1030,9 +1193,9 @@ vector ProjectionEngine::tile_hits(
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            if (pixel_offset[0] >= 0)
-                temp[thread][pixel_offset[0]]++;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point)
+                temp[thread][pixinds[i_point][0]]++;
         }
     }
 #pragma omp single
@@ -1064,6 +1227,7 @@ bp::object ProjectionEngine::tile_ranges(
     int n_tile = _pixelizor.tile_count();
     if (n_tile < 0)
         throw general_exception("No tiles in this pixelization.");
+    int n_domain = bp::len(tile_lists);
 
     // Make a vector that maps tile into thread.
     vector<int> thread_idx(n_tile, -1);
     for (int i=0; i<bp::len(tile_lists); i++) {
         for (int j=0; j<bp::len(tile_lists[i]); j++) {
             thread_idx[bp::extract<int>(tile_lists[i][j])] = i;
         }
     }
 
-    // Make a bunch of Ranges, index them with [thread, det].  Note
-    // thread isn't the OMP thread index, below, but the thread_index
-    // determined from tile_lists.  It's safe to populate these
-    // structures in parallel provided that only one thread touches
-    // each Ranges object, which is the case since each OMP thread
-    // below specializes in certain detectors.
-    vector<vector<RangesInt32>> ranges;
-    for (int i=0; i<bp::len(tile_lists); i++) {
-        vector<RangesInt32> v(n_det);
-        for (auto &_v: v)
-            _v.count = n_time;
-        ranges.push_back(v);
-    }
+    // ranges is [nbunch,ndomain,ndet][nrange]
+    // we loop in series over bunches, then in parallel over
+    // the domains in each bunch.  This could be set up in multiple
+    // ways.  Here we use 2 bunches.  The first has nthread domains,
+    // and takes care of the samples that unambiguously hit a domain.
+    // The second bunch contains just one pseudo-domain for all the samples
+    // that straddle a domain edge.  These will be done in series.
+    vector<vector<vector<RangesInt32>>> ranges(2);
+    auto & par_ranges = ranges[0];
+    auto & ser_ranges = ranges[1];
+    auto empty_range = vector<RangesInt32>(n_det, RangesInt32(n_time));
+    for (int i=0; i<n_domain; i++)
+        par_ranges.push_back(empty_range);
+    ser_ranges.push_back(empty_range);
 
 #pragma omp parallel for
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int last_slice = -1;
-        int slice_start = 0;
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
+        int last_domain = -1;
+        int domain_start = 0;
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            int this_slice = -1;
-            if (pixel_offset[0] >= 0)
-                this_slice = thread_idx[pixel_offset[0]];
-            if (this_slice != last_slice) {
-                if (last_slice >= 0)
-                    ranges[last_slice][i_det].append_interval_no_check(
-                        slice_start, i_time);
-                slice_start = i_time;
-                last_slice = this_slice;
+            // set samp_domain from thread_idx if all points for this sample agree, otherwise
+            // set it to n_domain
+            int samp_domain = -1;
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point) {
+                int point_domain = thread_idx[pixinds[i_point][0]];
+                if(i_point > 0 && point_domain != samp_domain) {
+                    // This is a mixed-domain sample.  Put it in the serial category.
+                    samp_domain = n_domain;
+                    break;
+                }
+                samp_domain = point_domain;
             }
+            // Did the domain status change?
+            if (samp_domain != last_domain) {
+                // Avoid triggering on the first sample, before we have established any domain
+                if (last_domain >= 0) {
+                    // Assign to either a parallel domain or the serial misc category
+                    auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+                    target_ranges[i_det].append_interval_no_check(domain_start, i_time);
+                }
+                domain_start = i_time;
+                last_domain = samp_domain;
+            }
+        }
+        // Close the last interval
+        if (last_domain >= 0) {
+            auto & target_ranges = last_domain < n_domain ? par_ranges[last_domain] : ser_ranges[0];
+            target_ranges[i_det].append_interval_no_check(domain_start, n_time);
+        }
-        if (last_slice >= 0)
-            ranges[last_slice][i_det].append_interval_no_check(
-                slice_start, n_time);
     }
 
-    // Convert vector<vector<RangesInt32>> to a list and return it.
-    auto ivals_out = bp::list();
-    for (int j=0; j<ranges.size(); j++) {
-        auto ivals = bp::list();
-        for (int i_det=0; i_det<n_det; i_det++)
-            ivals.append(ranges[j][i_det]);
-        ivals_out.append(bp::extract<bp::object>(ivals)());
-    }
-    return bp::extract<bp::object>(ivals_out);
+    // Convert super vector to a list and return
+    auto ivals = bp::list();
+    for (int i_bunch=0; i_bunch<ranges.size(); i_bunch++) {
+        auto bunches = bp::list();
+        for (int i_domain=0; i_domain<ranges[i_bunch].size(); i_domain++) {
+            auto domains = bp::list();
+            for (int i_det=0; i_det<n_det; i_det++)
+                domains.append(ranges[i_bunch][i_domain][i_det]);
+            bunches.append(bp::extract<bp::object>(domains)());
+        }
+        ivals.append(bp::extract<bp::object>(bunches)());
+    }
+    return bp::extract<bp::object>(ivals);
 }
 
 template <typename C, typename P, typename S>
@@ -1174,30 +1358,29 @@ bp::object ProjectionEngine::from_map(
     for (int i_det = 0; i_det < n_det; ++i_det) {
         double dofs[4];
         pointer.InitPerDet(i_det, dofs);
-        int pixel_offset[P::index_count] = {-1};
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count];
         FSIGNAL pf[S::comp_count];
         for (int i_time = 0; i_time < n_time; ++i_time) {
             double coords[4];
             pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-            _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-            if (pixel_offset[0] < 0) continue;
             spin_proj_factors(coords, pf);
-            FSIGNAL *sig = (_signalspace.data_ptr[i_det] +
-                            _signalspace.steps[0]*i_time);
-            for (int imap=0; imap<S::comp_count; ++imap)
-                *sig += *_pixelizor.pix(imap, pixel_offset) * pf[imap];
+            FSIGNAL *sig = (_signalspace.data_ptr[i_det] +
+                            _signalspace.steps[0]*i_time);
+            int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+            for(int i_point = 0; i_point < n_point; ++i_point)
+                for (int i_map=0; i_map<S::comp_count; ++i_map)
+                    *sig += *_pixelizor.pix(i_map, pixinds[i_point]) * pf[i_map] * pixweights[i_point];
         }
     }
 
 template <typename C, typename P, typename S>
 static
 void to_map_single_thread(Pointer<C> &pointer,
                           P &_pixelizor,
-                          vector<RangesInt32> ivals,
+                          const vector<RangesInt32> & ivals,
                          BufferWrapper<FSIGNAL> &_det_weights,
                          SignalSpace<FSIGNAL> *_signalspace)
 {
@@ -1209,19 +1392,22 @@ void to_map_single_thread(Pointer &pointer,
                                _det_weights->strides[0]*i_det);
         double dofs[4];
         double coords[4];
-        int pixel_offset[P::index_count] = {-1};
         FSIGNAL pf[S::comp_count];
         pointer.InitPerDet(i_det, dofs);
+        // Pointing matrix interpolation stuff
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count] = {0};
         for (auto const &rng: ivals[i_det].segments) {
             for (int i_time = rng.first; i_time < rng.second; ++i_time) {
                 pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-                if (pixel_offset[0] < 0) continue;
-                const FSIGNAL sig = *(_signalspace->data_ptr[i_det] +
-                                      _signalspace->steps[0]*i_time);
                 spin_proj_factors(coords, pf);
-                for (int imap=0; imap<S::comp_count; ++imap)
-                    *_pixelizor.pix(imap, pixel_offset) += sig * pf[imap] * weight;
+                const FSIGNAL sig = _signalspace->data_ptr[i_det][_signalspace->steps[0]*i_time];
+                // In interpolated mapmaking, like bilinear mapmaking, each sample hits
+                // multiple pixels, each with its own weight.
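+                // Schematically (illustrative), the loop below performs
+                //   map[c, pixel_k] += weight * sig * w_k * pf[c]
+                // for each of the n_point (pixel_k, w_k) pairs returned
+                // by GetPixels.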
+                int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+                for(int i_point = 0; i_point < n_point; ++i_point)
+                    for (int i_map=0; i_map<S::comp_count; ++i_map)
+                        *_pixelizor.pix(i_map, pixinds[i_point]) +=
+                            sig * pf[i_map] * weight * pixweights[i_point];
             }
         }
     }
 }
 
 template <typename C, typename P, typename S>
 static
 void to_weight_map_single_thread(Pointer<C> &pointer,
                                  P &_pixelizor,
-                                 vector<RangesInt32> ivals,
+                                 const vector<RangesInt32> & ivals,
                                  BufferWrapper<FSIGNAL> &_det_weights)
 {
         const FSIGNAL weight = *(FSIGNAL*)((char*)_det_weights->buf +
                                _det_weights->strides[0]*i_det);
         double dofs[4];
         double coords[4];
-        int pixel_offset[P::index_count] = {-1};
         FSIGNAL pf[S::comp_count];
         pointer.InitPerDet(i_det, dofs);
+        int pixinds[P::interp_count][P::index_count] = {-1};
+        FSIGNAL pixweights[P::interp_count] = {0};
         for (auto const &rng: ivals[i_det].segments) {
             for (int i_time = rng.first; i_time < rng.second; ++i_time) {
                 pointer.GetCoords(i_det, i_time, (double*)dofs, (double*)coords);
-                _pixelizor.GetPixel(i_det, i_time, (double*)coords, pixel_offset);
-                if (pixel_offset[0] < 0) continue;
                 spin_proj_factors(coords, pf);
-                for (int imap=0; imap<S::comp_count; ++imap)
-                    for (int jmap=imap; jmap<S::comp_count; ++jmap)
-                        *_pixelizor.pix(imap, jmap, pixel_offset) +=
-                            pf[imap] * pf[jmap] * weight;
+                int n_point = _pixelizor.GetPixels(i_det, i_time, coords, pixinds, pixweights);
+                for(int i_point = 0; i_point < n_point; ++i_point)
+                    for (int i_map=0; i_map<S::comp_count; ++i_map)
+                        for (int j_map=i_map; j_map<S::comp_count; ++j_map)
+                            *_pixelizor.pix(i_map, j_map, pixinds[i_point]) +=
+                                pf[i_map] * pf[j_map] * weight *
+                                pixweights[i_point] * pixweights[i_point];
             }
         }
     }
 }
 
 static
-vector<vector<RangesInt32>> derive_ranges(
+vector<vector<vector<RangesInt32>>> derive_ranges(
     bp::object thread_intervals, int n_det, int n_time,
     std::string arg_name)
 {
     // - List of Ranges: This will be promoted to a single thread.
     // - List containing N lists of Ranges: N threads, with each list
     //   used by corresponding thread.
+    // - List containing M lists of N[M] lists of Ranges: M bunches
+    //   which will be looped over in serial, each containing N[M]
+    //   thread-lists that will be done in parallel.
+    vector<vector<vector<RangesInt32>>> ivals;
 
-    vector<vector<RangesInt32>> ivals;
     if (isNone(thread_intervals)) {
+        // It's None.  Generate a single bunch with a single thread covering all samples.
         auto r = RangesInt32(n_time).add_interval(0, n_time);
-        vector<RangesInt32> v(n_det, r);
+        vector<vector<RangesInt32>> v(1, vector<RangesInt32>(n_det, r));
         ivals.push_back(v);
+    } else if(bp::extract<RangesInt32>(thread_intervals[0]).check()) {
+        // It's a RangesMatrix (ndet,nranges).  Promote to single thread, single bunch.
+        ivals.push_back(vector<vector<RangesInt32>>(1, extract_ranges<int32_t>(thread_intervals)));
+    } else if(bp::extract<RangesInt32>(thread_intervals[0][0]).check()) {
+        // It's a per-thread RangesMatrix (nthread,ndet,nranges).  Promote to single bunch.
+        vector<vector<RangesInt32>> bunch;
+        for (int i=0; i<bp::len(thread_intervals); i++)
+            bunch.push_back(extract_ranges<int32_t>(thread_intervals[i]));
+        ivals.push_back(bunch);
+    } else if(bp::extract<RangesInt32>(thread_intervals[0][0][0]).check()) {
+        // It's a full multi-bunch (nbunch,nthread,ndet,nranges) thing.
+        for (int i=0; i<bp::len(thread_intervals); i++) {
+            vector<vector<RangesInt32>> bunch;
+            for (int j=0; j<bp::len(thread_intervals[i]); j++)
+                bunch.push_back(extract_ranges<int32_t>(thread_intervals[i][j]));
+            ivals.push_back(bunch);
+        }
     } else {
-        // Don't convert to a list... just use indexing.
-        if (bp::extract<RangesInt32>(thread_intervals[0]).check()) {
-            ivals.push_back(extract_ranges<int32_t>(thread_intervals));
-        } else {
-            // Assumed it is list of lists of ranges.
-            for (int i=0; i<bp::len(thread_intervals); i++)
-                ivals.push_back(extract_ranges<int32_t>(thread_intervals[i]));
+        // This should not happen
+        assert(false);
+    }
+    // Check that these all have the right shape.  Maybe consider a standard
+    // for loop instead of foreach to give more useful error messages using
+    // the index.
+    for(auto const & bunch: ivals)
+        for(auto const & ival: bunch) {
+            if (ival.size() != n_det) {
+                std::ostringstream err;
+                err << "Expected RangesMatrix with n_det=" << n_det
+                    << " but encountered n_det=" << ival.size();
+                throw shape_exception(arg_name, err.str());
             }
-        // Check that these all have the right shape.
-        for (auto const &ival: ivals) {
-            if (ival.size() != n_det) {
-                std::ostringstream err;
-                err << "Expected RangesMatrix with n_det=" << n_det
-                    << " but encountered n_det=" << ival.size();
-                throw shape_exception(arg_name, err.str());
-            }
-            for (auto const &ri: ival) {
-                if (ri.count != n_time) {
-                    std::ostringstream err;
-                    err << "Expected RangesMatrix with n_time=" << n_time
-                        << " but encountered n_time=" << ri.count;
-                    throw shape_exception(arg_name, err.str());
-                }
-            }
-        }
+            for (auto const &ri: ival) {
+                if (ri.count != n_time) {
+                    std::ostringstream err;
+                    err << "Expected RangesMatrix with n_time=" << n_time
+                        << " but encountered n_time=" << ri.count;
+                    throw shape_exception(arg_name, err.str());
+                }
+            }
+        }
     return ivals;
 }
@@ -1333,35 +1544,20 @@ bp::object ProjectionEngine::to_map(
     auto _det_weights = BufferWrapper<FSIGNAL>(
         "det_weights", det_weights, true, vector<int>{n_det});
 
-    // For multi-threading, the principle here is that all threads
-    // loop over all detectors, but the sample ranges encoded in ivals
-    // are disjoint.
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
+    // For multi-threading, the principle here is that we loop serially
+    // over bunches, and then inside each bunch all threads loop over
+    // all detectors in parallel, but the sample ranges are pixel-disjoint.
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time,
                                "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
-            to_map_single_thread<C,P,S>(
-                pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    to_map_single_thread<C,P,S>(
-                        pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
-            }
-        }
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
+            to_map_single_thread<C,P,S>(pointer, _pixelizor, ivals[i_thread], _det_weights, &_signalspace);
     }
     return map;
 }
@@ -1390,46 +1586,33 @@ bp::object ProjectionEngine::to_weight_map(
     auto _det_weights = BufferWrapper<FSIGNAL>(
         "det_weights", det_weights, true, vector<int>{n_det});
 
-    // For multi-threading, the principle here is that all threads
-    // loop over all detectors, but the sample ranges encoded in ivals
-    // are disjoint.
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
+    // For multi-threading, the principle here is that we loop serially
+    // over bunches, and then inside each bunch all threads loop over
+    // all detectors in parallel, but the sample ranges are pixel-disjoint.
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time,
                                "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
-            to_weight_map_single_thread<C,P,S>(
-                pointer, _pixelizor, ivals[i_thread], _det_weights);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    to_weight_map_single_thread<C,P,S>(
-                        pointer, _pixelizor, ivals[i_thread], _det_weights);
-            }
-        }
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
+            to_weight_map_single_thread<C,P,S>(pointer, _pixelizor, ivals[i_thread], _det_weights);
     }
     return map;
 }
 
-
 // ProjEng_Precomp
 //
 // The ProjEng_Precomp may be used for very fast projection
 // operations, provided you have precomputed pixel index and spin
 // projection factors for every sample.  This is agnostic of
-// coordinate system, and so not crazily templated.
+// coordinate system, and so not crazily templated.  It is
+// hard-coded to nearest neighbor interpolation, though one could
+// emulate interpolation by calling it repeatedly with different
+// pixel indices and weights.
 //
 
 template <typename TilingSys>
@@ -1599,34 +1782,18 @@ bp::object ProjEng_Precomp::to_map(
         throw shape_exception("pixel_index",
             "Fast dimension of pixel indices must be close-packed.");
 
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
-                               "thread_intervals");
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
             precomp_to_map_single_thread(
                 pixelizor, pixel_buf_man, spin_proj_man,
-                ivals[i_thread], _det_weights, &_signalspace);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    precomp_to_map_single_thread(
-                        pixelizor, pixel_buf_man, spin_proj_man,
-                        ivals[i_thread], _det_weights, &_signalspace);
-            }
-        }
+                ivals[i_thread], _det_weights, &_signalspace);
     }
     return map;
 }
@@ -1662,34 +1829,18 @@ bp::object ProjEng_Precomp::to_weight_map(
         throw shape_exception("pixel_index",
             "Fast dimension of pixel indices must be close-packed.");
 
-    auto ivals = derive_ranges(thread_intervals, n_det, n_time,
-                               "thread_intervals");
+    auto bunches = derive_ranges(thread_intervals, n_det, n_time, "thread_intervals");
 
-    if (ivals.size() <= 1) {
-        // This block may also be used if OMP is disabled.
-        for (int i_thread=0; i_thread < ivals.size(); i_thread++)
+    // First loop over the serial bunches
+    for(int i_bunch = 0; i_bunch < bunches.size(); i_bunch++) {
+        const auto & ivals = bunches[i_bunch];
+        // Then loop in parallel over the domains in this bunch.  This works
+        // even if omp is not enabled, or if ivals.size() == 1.
+        #pragma omp parallel for
+        for (int i_thread = 0; i_thread < ivals.size(); i_thread++)
            precomp_to_weight_map_single_thread(
                 pixelizor, pixel_buf_man, spin_proj_man,
-                ivals[i_thread], _det_weights);
-    } else {
-#pragma omp parallel
-        {
-            // This loop construction handles the (sub-optimal) case
-            // that the number of ivals is not equal to the number of
-            // OMP threads.
-            int num_threads = 1;
-            int thread_num = 0;
-            #ifdef _OPENMP
-            num_threads = omp_get_num_threads();
-            thread_num = omp_get_thread_num();
-            #endif
-            for (int i_thread=0; i_thread < ivals.size(); i_thread++) {
-                if (i_thread % num_threads == thread_num)
-                    precomp_to_weight_map_single_thread(
-                        pixelizor, pixel_buf_man, spin_proj_man,
-                        ivals[i_thread], _det_weights);
-            }
-        }
+                ivals[i_thread], _det_weights);
     }
     return map;
 }
@@ -1698,10 +1849,13 @@
 typedef ProjEng_Precomp<NonTiled> ProjEng_Precomp_NonTiled;
 typedef ProjEng_Precomp<Tiled> ProjEng_Precomp_Tiled;
 
 #define PROJENG(PIX, SPIN, TILING) ProjEng_##PIX##_##SPIN##_##TILING
+#define PROJENG_INTERP(PIX, SPIN, TILING, INTERP) ProjEng_##PIX##_##SPIN##_##TILING##_##INTERP
 
-#define TYPEDEF_TILING(PIX, SPIN, TILING) \
+#define TYPEDEF_TILING(PIX, SPIN, TILING) \
   typedef ProjectionEngine<Proj##PIX,Pixelizor2_Flat<TILING>,Spin##SPIN> \
-    PROJENG(PIX, SPIN, TILING);
+    PROJENG(PIX, SPIN, TILING); \
+  typedef ProjectionEngine<Proj##PIX,Pixelizor2_Flat<TILING,Bilinear>,Spin##SPIN> \
+    PROJENG_INTERP(PIX, SPIN, TILING, Bilinear);
 
 #define TYPEDEF_SPIN(PIX, SPIN) \
     TYPEDEF_TILING(PIX, SPIN, Tiled) \
@@ -1740,7 +1894,8 @@ TYPEDEF_PIX(ZEA)
 
 #define EXPORT_TILING(PIX, SPIN, TILING) \
-    EXPORT_ENGINE(PROJENG(PIX, SPIN, TILING))
+    EXPORT_ENGINE(PROJENG(PIX, SPIN, TILING)) \
+    EXPORT_ENGINE(PROJENG_INTERP(PIX, SPIN, TILING, Bilinear))
 
 #define EXPORT_SPIN(PIX, SPIN) \
     EXPORT_TILING(PIX, SPIN, Tiled) \
diff --git a/test/test_proj_eng.py b/test/test_proj_eng.py
index 18c255bc..95652d5a 100644
--- a/test/test_proj_eng.py
+++ b/test/test_proj_eng.py
@@ -103,13 +103,14 @@ def test_20_threads(self):
                 continue
             else:
                 threads = p.assign_threads(asm, method=method, n_threads=n_threads)
-            self.assertEqual(threads.shape, (n_threads,) + sig.shape,
+            # This may need to be generalized if we implement fancier threads schemes.
+            self.assertEqual(threads[0].shape, (n_threads,) + sig.shape,
                              msg=f'threads has wrong shape ({detail})')
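+            # For reference (illustrative): threads[0] holds the n_threads
+            # pixel-disjoint parallel domains, while threads[1] has shape
+            # (1,) + sig.shape and holds the serial leftover samples.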
             # Make sure the threads cover the TOD, or not,
-            # depending on clipped.
-            counts = np.zeros(threads.shape[1:], int)
-            for t in threads:
+            # depending on clipped.  This may also need generalization.
+            counts = np.zeros(threads[0].shape[1:], int)
+            for t in threads[0]:
                 counts += t.mask()
             target = set([0,1]) if clipped else set([1])
             self.assertEqual(set(counts.ravel()), target,