/***************************************************************************
 *
 *  Copyright (C) Codeplay Software Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Codeplay's SYCL-For-CUDA-Examples
 *
 *  distributed-batch-gemm.cpp
 *
 *  Description:
 *    Matrix multiplication distributed across devices using MPI
 **************************************************************************/
#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>
#include <algorithm>
#include <iostream>
#include <mpi.h>
#include <mpi-ext.h>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <sycl_blas.h>
#define PRINT_DEBUG_MODE 1
int main(int argc, char **argv) {
  /* Create a SYCL queue targeting a GPU (the CUDA device on this rank) */
  sycl::queue q(sycl::gpu_selector{});
  /* -------------------------------------------------------------------------------------------
     Check to see if the MPI library is CUDA-aware
  --------------------------------------------------------------------------------------------*/
  printf("Run time check:\n");
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    printf("This MPI library has CUDA-aware support.\n");
  } else {
    printf("This MPI library does not have CUDA-aware support.\n");
  }
#else  /* !defined(MPIX_CUDA_AWARE_SUPPORT) */
  printf("This MPI library cannot determine if there is CUDA-aware support.\n");
#endif /* MPIX_CUDA_AWARE_SUPPORT */
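  /* CUDA-aware MPI matters here because the MPI_Scatter/MPI_Gather calls
     below are given raw CUDA device pointers, so the library must be able to
     read and write GPU memory directly. */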
  /* -------------------------------------------------------------------------------------------
     MPI Initialization
  --------------------------------------------------------------------------------------------*/
  MPI_Init(&argc, &argv);
  int size;
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  if (size != 2) {
    if (rank == 0) {
      printf("This program requires exactly 2 MPI ranks, but you are "
             "attempting to use %d! Exiting...\n",
             size);
    }
    MPI_Finalize();
    exit(0);
  }
  double start_time, stop_time, elapsed_time;
  /* Create a SYCL-BLAS executor and get the policy handler */
  blas::Executor<blas::PolicyHandler<blas::codeplay_policy>> executor(q);
  auto policy_handler = executor.get_policy_handler();
  /* Arguments of the Gemm operation.
   * Note: these matrix dimensions are too small to get a performance gain by
   * using SYCL-BLAS, but they are convenient for this sample */
  const int m = 32;
  const int k = 32;
  const int n = 32;
  const int lda = m;
  const int ldb = k;
  const int ldc = m;
  const float alpha = 1;
  const float beta = 0;
  const int batch = 2;
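  /* Each of the two MPI ranks computes one GEMM of the batch, so `batch` must
     match the number of ranks checked above. */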
  /* Create the local (per-rank) buffers */
  auto local_a_gpu = blas::make_sycl_iterator_buffer<float>(lda * k);
  auto local_b_gpu = blas::make_sycl_iterator_buffer<float>(ldb * n);
  auto local_c_gpu = blas::make_sycl_iterator_buffer<float>(ldc * n);
  /* Create the global (batched) buffers */
  auto global_a_gpu = blas::make_sycl_iterator_buffer<float>(batch * lda * k);
  auto global_b_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldb * n);
  auto global_c_gpu = blas::make_sycl_iterator_buffer<float>(batch * ldc * n);
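  /* The global buffers hold the `batch` matrices contiguously, one after
     another, so MPI_Scatter can hand each rank exactly one lda * k
     (respectively ldb * n, ldc * n) matrix. */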
  if (rank == 0) {
    // Initialise A and B on the root rank (both matrices filled with ones)
    std::vector<float> A = std::vector<float>(batch * lda * k, float(1.0));
    std::vector<float> B = std::vector<float>(batch * ldb * n, float(1.0));
    policy_handler.copy_to_device(A.data(), global_a_gpu, batch * lda * k);
    policy_handler.copy_to_device(B.data(), global_b_gpu, batch * ldb * n);
  }
  /* -------------------------------------------------------------------------------------------
     Use SYCL/CUDA interoperability to scatter one batch each of A and B to
     the two MPI processes
  --------------------------------------------------------------------------------------------*/
  start_time = MPI_Wtime();
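  /* Scatter A: the interop task retrieves the native CUDA pointers behind the
     SYCL accessors and passes them directly to MPI_Scatter */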
  auto ht_a = [&](sycl::handler &h) {
    auto global_a_acc =
        global_a_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_a_acc =
        local_a_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.interop_task([=](sycl::interop_handle ih) {
      auto global_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(global_a_acc));
      auto local_a_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(local_a_acc));
      MPI_Scatter(global_a_ptr, lda * k, MPI_FLOAT, local_a_ptr, lda * k,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_a);
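  /* Scatter B in the same way */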
  auto ht_b = [&](sycl::handler &h) {
    auto global_b_acc =
        global_b_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    auto local_b_acc =
        local_b_gpu.get_buffer().template get_access<sycl::access::mode::write>(
            h);
    h.interop_task([=](sycl::interop_handle ih) {
      auto global_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(global_b_acc));
      auto local_b_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(local_b_acc));
      MPI_Scatter(global_b_ptr, ldb * n, MPI_FLOAT, local_b_ptr, ldb * n,
                  MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_b);
  q.wait_and_throw();
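  /* Both scatters have now completed on every rank, so the local A and B
     buffers are safe for the GEMM to read */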
  /* Execute the GEMM operation: each rank multiplies its own m x k slice of A
     by its k x n slice of B */
  auto event = blas::_gemm(executor, 'n', 'n', m, n, k, alpha, local_a_gpu, lda,
                           local_b_gpu, ldb, beta, local_c_gpu, ldc);
  policy_handler.wait(event);
  /* -------------------------------------------------------------------------------------------
     Use SYCL/CUDA interoperability to gather each rank's local result back
     into the global batched C buffer on the root rank
  --------------------------------------------------------------------------------------------*/
  auto ht_c = [&](sycl::handler &h) {
    auto global_c_acc = global_c_gpu.get_buffer()
                            .template get_access<sycl::access::mode::write>(h);
    auto local_c_acc =
        local_c_gpu.get_buffer().template get_access<sycl::access::mode::read>(
            h);
    h.interop_task([=](sycl::interop_handle ih) {
      auto local_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(local_c_acc));
      auto global_c_ptr = reinterpret_cast<float *>(
          ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(global_c_acc));
      MPI_Gather(local_c_ptr, ldc * n, MPI_FLOAT, global_c_ptr, ldc * n,
                 MPI_FLOAT, 0, MPI_COMM_WORLD);
    });
  };
  q.submit(ht_c);
  q.wait_and_throw();
  stop_time = MPI_Wtime();
  elapsed_time = stop_time - start_time;
  /* -------------------------------------------------------------------------------------------
     Print the output
  --------------------------------------------------------------------------------------------*/
  if (rank == 0) {
    std::cout << "elapsed_time: " << elapsed_time << " s\n";
#if defined(PRINT_DEBUG_MODE)
    auto C = global_c_gpu.get_buffer().get_host_access();
    for (int i = 0; i < batch * ldc * n; i++) {
      std::cout << " value at " << i << " : " << C[i] << "\n";
    }
#endif
  }
  MPI_Finalize();
  return 0;
}
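/* Example launch (a sketch; the exact compiler driver and flags depend on
 * your SYCL implementation, SYCL-BLAS build and CUDA-aware MPI installation):
 *
 *   mpirun -np 2 ./distributed-batch-gemm
 *
 * Exactly two ranks are required, matching the check at the top of main(). */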