[core] NUMA-aware pinned allocator
Use cudaHostRegister/cudaHostUnregister instead of cudaMallocHost so that pinned
host memory can be placed on a specific NUMA node before it is page-locked.
Dmytro Dzhulgakov authored and dzhulgakov committed Mar 6, 2018
1 parent 7d8188a commit 496c999
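
In short, when NUMA is enabled the pinned allocator first obtains host memory from the
regular CPU allocator (which can place it on the requested NUMA node) and then page-locks
it with cudaHostRegister so CUDA can still treat it as pinned memory; on free,
cudaHostUnregister undoes the pinning before the memory is released. A minimal sketch of
that pattern follows, for illustration only: the function names are invented here, the
NUMA placement step is reduced to a comment, and the locking the real allocator does
around CUDA calls is omitted.

    // Sketch only: pin already-placed host memory instead of using cudaMallocHost.
    #include <cuda_runtime.h>
    #include <stdlib.h>
    #include <string.h>

    void* NewPinnedOnNode(size_t nbytes) {
      // 1) Obtain ordinary host memory that the NUMA machinery can place or move
      //    (the real code goes through DefaultCPUAllocator and then NUMAMove).
      void* data = nullptr;
      if (posix_memalign(&data, 64, nbytes) != 0) {
        return nullptr;
      }
      // 2) Page-lock the existing allocation so CUDA can DMA to/from it.
      if (cudaHostRegister(data, nbytes, cudaHostRegisterDefault) != cudaSuccess) {
        free(data);
        return nullptr;
      }
      memset(data, 0, nbytes);
      return data;
    }

    void DeletePinnedOnNode(void* data) {
      cudaHostUnregister(data); // undo the page-lock first
      free(data);               // then release the host memory
    }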
Showing 3 changed files with 56 additions and 26 deletions.
31 changes: 23 additions & 8 deletions caffe2/core/context_gpu.h
@@ -24,6 +24,7 @@
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/numa.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2.pb.h"
@@ -304,7 +305,14 @@ struct PinnedCPUAllocator final : CPUAllocator {
   std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
     void* data;
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+    if (IsNUMAEnabled()) {
+      auto ptr_and_deleter = baseAllocator_.New(nbytes);
+      data = ptr_and_deleter.first;
+      CAFFE_ENFORCE(data);
+      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
+    } else {
+      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+    }
     memset(data, 0, nbytes);
     return {data, Delete};
   }
@@ -321,16 +329,23 @@ struct PinnedCPUAllocator final : CPUAllocator {
     // But, if one calls CPUContext::New() before any cuda allocations,
     // PinnedCPUAllocator can still delete the corresponding memory.
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    cudaError_t err = cudaFreeHost(data);
-    if (err == cudaErrorInvalidValue) {
-      free(data);
-      // Calling cudaGetLastError will reset the cuda error.
-      cudaGetLastError();
+    if (IsNUMAEnabled()) {
+      CUDA_ENFORCE(cudaHostUnregister(data));
+      DefaultCPUAllocator::Delete(data);
     } else {
-      // For all other errors, still do a cuda check.
-      CUDA_ENFORCE(err);
+      cudaError_t err = cudaFreeHost(data);
+      if (err == cudaErrorInvalidValue) {
+        free(data);
+        // Calling cudaGetLastError will reset the cuda error.
+        cudaGetLastError();
+      } else {
+        // For all other errors, still do a cuda check.
+        CUDA_ENFORCE(err);
+      }
     }
   }
 
+  DefaultCPUAllocator baseAllocator_;
 };
 
 // For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
9 changes: 4 additions & 5 deletions caffe2/core/numa.cc
@@ -22,7 +22,6 @@ CAFFE2_DEFINE_bool(
     "Use NUMA whenever possible.");
 
 #if defined(__linux__) && !defined(CAFFE2_DISABLE_NUMA) && CAFFE2_MOBILE == 0
-#include <errno.h>
 #include <numa.h>
 #include <numaif.h>
 #define CAFFE2_NUMA_ENABLED
@@ -94,15 +93,15 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) {
   // Avoid extra dynamic allocation and NUMA api calls
   CAFFE_ENFORCE(numa_node_id >= 0 && numa_node_id < sizeof(unsigned long) * 8);
   unsigned long mask = 1UL << numa_node_id;
-  if (mbind(
+  CAFFE_ENFORCE(
+      mbind(
           (void*)page_start_ptr,
           size + offset,
           MPOL_BIND,
           &mask,
           sizeof(mask) * 8,
-          MPOL_MF_MOVE | MPOL_MF_STRICT) != 0) {
-    LOG(ERROR) << "Could not move memory to a NUMA node: " << strerror(errno);
-  }
+          MPOL_MF_MOVE | MPOL_MF_STRICT) == 0,
+      "Could not move memory to a NUMA node");
 }
 
 int GetCurrentNUMANode() {
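
The hunk above shows only the tail of NUMAMove. For context, here is a minimal sketch of
the mbind() call it builds up to; the page_start_ptr/offset computation is reconstructed
from what the function does, the IsNUMAEnabled() and error checks are left out, and
linking against libnuma is assumed.

    #include <numaif.h>   // mbind, MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT
    #include <unistd.h>   // sysconf
    #include <stdint.h>

    // Bind (and move) the pages backing [ptr, ptr + size) to one NUMA node.
    void MovePagesToNode(void* ptr, size_t size, int numa_node_id) {
      const uintptr_t page_size = sysconf(_SC_PAGESIZE);
      const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
      const uintptr_t page_start = addr & ~(page_size - 1); // round down to a page boundary
      const size_t offset = addr - page_start;              // bytes preceding ptr on that page
      unsigned long mask = 1UL << numa_node_id;             // single-node nodemask
      // mbind operates on whole pages, hence the rounded start and size + offset.
      mbind(reinterpret_cast<void*>(page_start),
            size + offset,
            MPOL_BIND,
            &mask,
            sizeof(mask) * 8,
            MPOL_MF_MOVE | MPOL_MF_STRICT);
    }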
42 changes: 29 additions & 13 deletions caffe2/python/numa_test.py
@@ -10,23 +10,39 @@
 core.GlobalInit(["caffe2", "--caffe2_cpu_numa_enabled=1"])
 
 
+def build_test_net(net_name):
+    net = core.Net(net_name)
+    net.Proto().type = "async_scheduling"
+
+    numa_device_option = caffe2_pb2.DeviceOption()
+    numa_device_option.device_type = caffe2_pb2.CPU
+    numa_device_option.numa_node_id = 0
+
+    net.ConstantFill([], "output_blob_0", shape=[1], value=3.14,
+        device_option=numa_device_option)
+
+    numa_device_option.numa_node_id = 1
+    net.ConstantFill([], "output_blob_1", shape=[1], value=3.14,
+        device_option=numa_device_option)
+
+    gpu_device_option = caffe2_pb2.DeviceOption()
+    gpu_device_option.device_type = caffe2_pb2.CUDA
+    gpu_device_option.cuda_gpu_id = 0
+
+    net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu",
+        device_option=gpu_device_option)
+    net.CopyCPUToGPU("output_blob_1", "output_blob_1_gpu",
+        device_option=gpu_device_option)
+
+    return net
+
+
 @unittest.skipIf(not workspace.IsNUMAEnabled(), "NUMA is not enabled")
 @unittest.skipIf(workspace.GetNumNUMANodes() < 2, "Not enough NUMA nodes")
+@unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
 class NUMATest(TestCase):
     def test_numa(self):
-        net = core.Net("test_numa")
-        net.Proto().type = "async_scheduling"
-
-        numa_device_option = caffe2_pb2.DeviceOption()
-        numa_device_option.device_type = caffe2_pb2.CPU
-        numa_device_option.numa_node_id = 0
-
-        net.ConstantFill([], "output_blob_0", shape=[1], value=3.14,
-            device_option=numa_device_option)
-
-        numa_device_option.numa_node_id = 1
-        net.ConstantFill([], "output_blob_1", shape=[1], value=3.14,
-            device_option=numa_device_option)
+        net = build_test_net("test_numa")
 
         workspace.RunNetOnce(net)