// vectorAddMMAP.cpp (forked from NVIDIA/cuda-samples)
/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Vector addition: C = A + B.
 *
 * This sample replaces the device allocation in the vectorAddDrv sample with
 * cuMemMap-ed allocations. It demonstrates that the cuMemMap API allows the
 * user to specify the physical properties of their memory while retaining the
 * contiguous nature of their access, so no change to the program structure is
 * required.
 */
// Includes
#include <cuda.h>
#include <stdio.h>
#include <cstring>
#include <cstdlib>  // rand(), RAND_MAX
#include <cmath>    // fabs()
#include <iostream>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
// includes, CUDA
#include <builtin_types.h>
#include "multidevicealloc_memmap.hpp"
using namespace std;
// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
float *h_A;
float *h_B;
float *h_C;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;
size_t allocationSize = 0;
// Functions
int CleanupNoFailure();
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &);
// define input ptx file for different platforms
#if defined(_WIN64) || defined(__LP64__)
#define PTX_FILE "vectorAdd_kernel64.ptx"
#else
#define PTX_FILE "vectorAdd_kernel32.ptx"
#endif
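// The cuMemMap-style allocator used below (simpleMallocMultiDeviceMmap from
// multidevicealloc_memmap.hpp) is assumed to round the requested byte count up
// to the allocation granularity, which is why it reports the actual size back
// through allocationSize. The helper below is a minimal, illustrative sketch of
// that rounding step, assuming pinned device memory as the backing; the name
// roundUpToGranularity is hypothetical and the function is not called anywhere.
size_t roundUpToGranularity(size_t size, CUdevice device) {
  // Describe the physical allocation the granularity applies to.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  // Every size passed to cuMemCreate/cuMemMap must be a multiple of this value.
  size_t granularity = 0;
  checkCudaErrors(cuMemGetAllocationGranularity(
      &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  return ((size + granularity - 1) / granularity) * granularity;
}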
// collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
int num_devices;
checkCudaErrors(cuDeviceGetCount(&num_devices));
vector<CUdevice> backingDevices;
backingDevices.push_back(cuDevice);
for (int dev = 0; dev < num_devices; dev++) {
int capable = 0;
int attributeVal = 0;
// The mapping device is already in the backingDevices vector
if (dev == cuDevice) {
continue;
}
// Only peer-capable devices can map each other's memory
checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
if (!capable) {
continue;
}
    // The candidate backing device also needs to support virtual address
    // management for the required APIs to work
    checkCudaErrors(cuDeviceGetAttribute(
        &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
        dev));
if (attributeVal == 0) {
continue;
}
backingDevices.push_back(dev);
}
return backingDevices;
}
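// A sketch of how the backing/mapping device split produced by
// getBackingDevices() is assumed to be consumed by simpleMallocMultiDeviceMmap:
// the physical backing is striped across the backing devices (one cuMemCreate
// chunk per device), each chunk is mapped at a consecutive offset of a single
// VA reservation, and cuMemSetAccess then grants every mapping device access to
// the whole contiguous range. stripeSize is assumed to already be a multiple of
// the allocation granularity, error-path cleanup is omitted for brevity, and
// the helper (a hypothetical name) is illustrative only; it is never called.
CUresult sketchStripedMultiDeviceAlloc(CUdeviceptr *dptr, size_t stripeSize,
                                       const vector<CUdevice> &backingDevices,
                                       const vector<CUdevice> &mappingDevices) {
  size_t totalSize = stripeSize * backingDevices.size();

  // One contiguous VA range covers all of the stripes.
  CUresult status = cuMemAddressReserve(dptr, totalSize, 0, 0, 0);

  // Create one physical stripe per backing device and map them back to back.
  for (size_t i = 0; status == CUDA_SUCCESS && i < backingDevices.size(); i++) {
    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = backingDevices[i];

    CUmemGenericAllocationHandle handle;
    status = cuMemCreate(&handle, stripeSize, &prop, 0);
    if (status == CUDA_SUCCESS) {
      status = cuMemMap(*dptr + i * stripeSize, stripeSize, 0, handle, 0);
      cuMemRelease(handle);  // the mapping keeps the physical memory alive
    }
  }

  // Grant every mapping device read/write access to the full range; this is
  // what makes an explicit cuCtxEnablePeerAccess call unnecessary in main().
  for (size_t i = 0; status == CUDA_SUCCESS && i < mappingDevices.size(); i++) {
    CUmemAccessDesc accessDesc = {};
    accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDesc.location.id = mappingDevices[i];
    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    status = cuMemSetAccess(*dptr, totalSize, &accessDesc, 1);
  }
  return status;
}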
// Host code
int main(int argc, char **argv) {
printf("Vector Addition (Driver API)\n");
int N = 50000;
size_t size = N * sizeof(float);
int attributeVal = 0;
// Initialize
checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Check that the selected device supports virtual address management
checkCudaErrors(cuDeviceGetAttribute(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
cuDevice));
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice,
attributeVal);
if (attributeVal == 0) {
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
exit(EXIT_WAIVED);
}
// The vector addition happens on cuDevice, so the allocations need to be
// mapped there.
vector<CUdevice> mappingDevices;
mappingDevices.push_back(cuDevice);
// Collect devices accessible by the mapping device (cuDevice) into the
// backingDevices vector.
vector<CUdevice> backingDevices = getBackingDevices(cuDevice);
// Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// First search for the module path before we load the module
string module_path, ptx_source;
if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
if (!findModulePath("vectorAdd_kernel.cubin", module_path, argv,
ptx_source)) {
printf("> findModulePath could not find <vectorAdd> ptx or cubin\n");
exit(EXIT_FAILURE);
}
} else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
// Create module from binary file (PTX or CUBIN)
if (module_path.rfind("ptx") != string::npos) {
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
    checkCudaErrors(cuModuleLoadDataEx(&cuModule, ptx_source.c_str(),
                                       jitNumOptions, jitOptions,
                                       (void **)jitOptVals));
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
    // Release the JIT option arrays and log buffer now that loading is done
    delete[] jitOptions;
    delete[] jitOptVals;
    delete[] jitLogBuffer;
} else {
checkCudaErrors(cuModuleLoad(&cuModule, module_path.c_str()));
}
// Get function handle from module
checkCudaErrors(
cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// Allocate input vectors h_A and h_B in host memory
h_A = (float *)malloc(size);
h_B = (float *)malloc(size);
h_C = (float *)malloc(size);
// Initialize input vectors
RandomInit(h_A, N);
RandomInit(h_B, N);
  // Allocate vectors in device memory.
  // Note that a call to cuCtxEnablePeerAccess is not needed even though the
  // backing devices and the mapping device are not the same, because the
  // cuMemSetAccess call explicitly specifies the cross-device mapping.
  // cuMemSetAccess is still subject to the constraints of cuDeviceCanAccessPeer
  // for cross-device mappings, which is why cuDeviceCanAccessPeer was checked
  // earlier.
checkCudaErrors(simpleMallocMultiDeviceMmap(&d_A, &allocationSize, size,
backingDevices, mappingDevices));
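  // d_B and d_C request the same byte count as d_A, so they are padded to the
  // same allocationSize; that single value is reused when freeing all three
  // allocations in CleanupNoFailure().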
checkCudaErrors(simpleMallocMultiDeviceMmap(&d_B, NULL, size, backingDevices,
mappingDevices));
checkCudaErrors(simpleMallocMultiDeviceMmap(&d_C, NULL, size, backingDevices,
mappingDevices));
// Copy vectors from host memory to device memory
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch
// (simpler method)
// Grid/Block configuration
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
threadsPerBlock, 1, 1, 0, NULL, args, NULL));
// Copy result from device memory to host memory
// h_C contains the result in host memory
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify result
int i;
for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) {
break;
}
}
CleanupNoFailure();
printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}
int CleanupNoFailure() {
// Free device memory
checkCudaErrors(simpleFreeMultiDeviceMmap(d_A, allocationSize));
checkCudaErrors(simpleFreeMultiDeviceMmap(d_B, allocationSize));
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
// Free host memory
if (h_A) {
free(h_A);
}
if (h_B) {
free(h_B);
}
if (h_C) {
free(h_C);
}
checkCudaErrors(cuCtxDestroy(cuContext));
return EXIT_SUCCESS;
}
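// A minimal sketch of what simpleFreeMultiDeviceMmap is assumed to do for an
// allocation created through the cuMemMap path: unmap the VA range, then
// release the address reservation. The physical handles were released right
// after mapping, so the unmap drops their last reference. The helper name is
// hypothetical and the function is illustrative only; it is never called.
CUresult sketchFreeMmapAlloc(CUdeviceptr dptr, size_t size) {
  CUresult status = cuMemUnmap(dptr, size);  // detach the physical backing
  if (status == CUDA_SUCCESS) {
    status = cuMemAddressFree(dptr, size);   // give back the reserved VA range
  }
  return status;
}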
// Allocates an array with random float entries.
void RandomInit(float *data, int n) {
for (int i = 0; i < n; ++i) {
data[i] = rand() / (float)RAND_MAX;
}
}
bool inline findModulePath(const char *module_file, string &module_path,
char **argv, string &ptx_source) {
char *actual_path = sdkFindFilePath(module_file, argv[0]);
if (actual_path) {
module_path = actual_path;
} else {
printf("> findModulePath file not found: <%s> \n", module_file);
return false;
}
if (module_path.empty()) {
printf("> findModulePath could not find file: <%s> \n", module_file);
return false;
} else {
printf("> findModulePath found file at <%s>\n", module_path.c_str());
if (module_path.rfind(".ptx") != string::npos) {
      FILE *fp = fopen(module_path.c_str(), "rb");
      if (!fp) {
        printf("> findModulePath could not open file: <%s>\n",
               module_path.c_str());
        return false;
      }
      fseek(fp, 0, SEEK_END);
      long file_size = ftell(fp);
      char *buf = new char[file_size + 1];
      fseek(fp, 0, SEEK_SET);
      size_t read_size = fread(buf, sizeof(char), file_size, fp);
      fclose(fp);
      buf[read_size] = '\0';  // cuModuleLoadDataEx expects NUL-terminated PTX
      ptx_source = buf;
      delete[] buf;
}
return true;
}
}