[core] NUMA-aware pinned allocator
Use cudaHostRegister/cudaHostUnregister instead of cudaMallocHost so that pinned
host memory can be placed on a specific NUMA node before it is page-locked.
Dmytro Dzhulgakov authored and dzhulgakov committed Mar 6, 2018
1 parent 7d8188a commit 496c999
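
In short, when NUMA is enabled the pinned allocator first obtains host memory from the
regular CPU allocator (which can place it on the requested NUMA node) and then page-locks
it with cudaHostRegister so CUDA can still treat it as pinned memory; on free,
cudaHostUnregister undoes the pinning before the memory is released. A minimal sketch of
that pattern follows, for illustration only: the function names are invented here, the
NUMA placement step is reduced to a comment, and the locking the real allocator does
around CUDA calls is omitted.

    // Sketch only: pin already-placed host memory instead of using cudaMallocHost.
    #include <cuda_runtime.h>
    #include <stdlib.h>
    #include <string.h>

    void* NewPinnedOnNode(size_t nbytes) {
      // 1) Obtain ordinary host memory that the NUMA machinery can place or move
      //    (the real code goes through DefaultCPUAllocator and then NUMAMove).
      void* data = nullptr;
      if (posix_memalign(&data, 64, nbytes) != 0) {
        return nullptr;
      }
      // 2) Page-lock the existing allocation so CUDA can DMA to/from it.
      if (cudaHostRegister(data, nbytes, cudaHostRegisterDefault) != cudaSuccess) {
        free(data);
        return nullptr;
      }
      memset(data, 0, nbytes);
      return data;
    }

    void DeletePinnedOnNode(void* data) {
      cudaHostUnregister(data); // undo the page-lock first
      free(data);               // then release the host memory
    }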
Showing 3 changed files with 56 additions and 26 deletions.
31 changes: 23 additions & 8 deletions caffe2/core/context_gpu.h
@@ -24,6 +24,7 @@
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
+#include "caffe2/core/numa.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2.pb.h"
@@ -304,7 +305,14 @@ struct PinnedCPUAllocator final : CPUAllocator {
   std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
     void* data;
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+    if (IsNUMAEnabled()) {
+      auto ptr_and_deleter = baseAllocator_.New(nbytes);
+      data = ptr_and_deleter.first;
+      CAFFE_ENFORCE(data);
+      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
+    } else {
+      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
+    }
     memset(data, 0, nbytes);
     return {data, Delete};
   }
@@ -321,16 +329,23 @@ struct PinnedCPUAllocator final : CPUAllocator {
     // But, if one calls CPUContext::New() before any cuda allocations,
     // PinnedCPUAllocator can still delete the corresponding memory.
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
-    cudaError_t err = cudaFreeHost(data);
-    if (err == cudaErrorInvalidValue) {
-      free(data);
-      // Calling cudaGetLastError will reset the cuda error.
-      cudaGetLastError();
+    if (IsNUMAEnabled()) {
+      CUDA_ENFORCE(cudaHostUnregister(data));
+      DefaultCPUAllocator::Delete(data);
     } else {
-      // For all other errors, still do a cuda check.
-      CUDA_ENFORCE(err);
+      cudaError_t err = cudaFreeHost(data);
+      if (err == cudaErrorInvalidValue) {
+        free(data);
+        // Calling cudaGetLastError will reset the cuda error.
+        cudaGetLastError();
+      } else {
+        // For all other errors, still do a cuda check.
+        CUDA_ENFORCE(err);
+      }
     }
   }
 
+  DefaultCPUAllocator baseAllocator_;
 };
 
 // For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
9 changes: 4 additions & 5 deletions caffe2/core/numa.cc
@@ -22,7 +22,6 @@ CAFFE2_DEFINE_bool(
     "Use NUMA whenever possible.");
 
 #if defined(__linux__) && !defined(CAFFE2_DISABLE_NUMA) && CAFFE2_MOBILE == 0
-#include <errno.h>
 #include <numa.h>
 #include <numaif.h>
 #define CAFFE2_NUMA_ENABLED
@@ -94,15 +93,15 @@ void NUMAMove(void* ptr, size_t size, int numa_node_id) {
   // Avoid extra dynamic allocation and NUMA api calls
   CAFFE_ENFORCE(numa_node_id >= 0 && numa_node_id < sizeof(unsigned long) * 8);
   unsigned long mask = 1UL << numa_node_id;
-  if (mbind(
+  CAFFE_ENFORCE(
+      mbind(
           (void*)page_start_ptr,
           size + offset,
           MPOL_BIND,
           &mask,
           sizeof(mask) * 8,
-          MPOL_MF_MOVE | MPOL_MF_STRICT) != 0) {
-    LOG(ERROR) << "Could not move memory to a NUMA node: " << strerror(errno);
-  }
+          MPOL_MF_MOVE | MPOL_MF_STRICT) == 0,
+      "Could not move memory to a NUMA node");
 }
 
 int GetCurrentNUMANode() {
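
The hunk above shows only the tail of NUMAMove. For context, here is a minimal sketch of
the mbind() call it builds up to; the page_start_ptr/offset computation is reconstructed
from what the function does, the IsNUMAEnabled() and error checks are left out, and
linking against libnuma is assumed.

    #include <numaif.h>   // mbind, MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT
    #include <unistd.h>   // sysconf
    #include <stdint.h>

    // Bind (and move) the pages backing [ptr, ptr + size) to one NUMA node.
    void MovePagesToNode(void* ptr, size_t size, int numa_node_id) {
      const uintptr_t page_size = sysconf(_SC_PAGESIZE);
      const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
      const uintptr_t page_start = addr & ~(page_size - 1); // round down to a page boundary
      const size_t offset = addr - page_start;              // bytes preceding ptr on that page
      unsigned long mask = 1UL << numa_node_id;             // single-node nodemask
      // mbind operates on whole pages, hence the rounded start and size + offset.
      mbind(reinterpret_cast<void*>(page_start),
            size + offset,
            MPOL_BIND,
            &mask,
            sizeof(mask) * 8,
            MPOL_MF_MOVE | MPOL_MF_STRICT);
    }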
42 changes: 29 additions & 13 deletions caffe2/python/numa_test.py
@@ -10,23 +10,39 @@
 core.GlobalInit(["caffe2", "--caffe2_cpu_numa_enabled=1"])
 
 
+def build_test_net(net_name):
+    net = core.Net(net_name)
+    net.Proto().type = "async_scheduling"
+
+    numa_device_option = caffe2_pb2.DeviceOption()
+    numa_device_option.device_type = caffe2_pb2.CPU
+    numa_device_option.numa_node_id = 0
+
+    net.ConstantFill([], "output_blob_0", shape=[1], value=3.14,
+        device_option=numa_device_option)
+
+    numa_device_option.numa_node_id = 1
+    net.ConstantFill([], "output_blob_1", shape=[1], value=3.14,
+        device_option=numa_device_option)
+
+    gpu_device_option = caffe2_pb2.DeviceOption()
+    gpu_device_option.device_type = caffe2_pb2.CUDA
+    gpu_device_option.cuda_gpu_id = 0
+
+    net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu",
+        device_option=gpu_device_option)
+    net.CopyCPUToGPU("output_blob_1", "output_blob_1_gpu",
+        device_option=gpu_device_option)
+
+    return net
+
+
 @unittest.skipIf(not workspace.IsNUMAEnabled(), "NUMA is not enabled")
 @unittest.skipIf(workspace.GetNumNUMANodes() < 2, "Not enough NUMA nodes")
+@unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
 class NUMATest(TestCase):
     def test_numa(self):
-        net = core.Net("test_numa")
-        net.Proto().type = "async_scheduling"
-
-        numa_device_option = caffe2_pb2.DeviceOption()
-        numa_device_option.device_type = caffe2_pb2.CPU
-        numa_device_option.numa_node_id = 0
-
-        net.ConstantFill([], "output_blob_0", shape=[1], value=3.14,
-            device_option=numa_device_option)
-
-        numa_device_option.numa_node_id = 1
-        net.ConstantFill([], "output_blob_1", shape=[1], value=3.14,
-            device_option=numa_device_option)
+        net = build_test_net("test_numa")
 
         workspace.RunNetOnce(net)