// torch/csrc/autograd/autograd_not_implemented_fallback.cpp (forked from pytorch/pytorch)
#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
#include <c10/util/irange.h>
#include <ATen/core/TorchDispatchUtils.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <ATen/core/ivalue.h>
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <torch/csrc/autograd/VariableTypeUtils.h>
#include <torch/csrc/autograd/autograd.h>
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/autograd/functions/basic_ops.h>
#include <torch/csrc/autograd/functions/utils.h>
#include <utility>
#include <vector>
namespace torch {
namespace autograd {
namespace {
template <typename F>
void _foreach_tensor(
F fn,
torch::jit::Stack* stack,
size_t stack_start,
size_t size) {
// Enumerate over tensors in a stack, including ones in TensorLists
int idx_tensor = 0;
for (const auto idx_arg : c10::irange(size)) {
auto& ivalue = (*stack)[stack_start + idx_arg];
if (ivalue.isTensor()) { // true for optional tensor that has value
const auto& tensor = ivalue.toTensor();
fn(idx_tensor, idx_arg, tensor);
idx_tensor++;
} else if (ivalue.isTensorList()) {
for (const auto& iv : ivalue.toListRef()) {
const auto& tensor = iv.toTensor();
fn(idx_tensor, idx_arg, tensor);
idx_tensor++;
}
}
}
}
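// A minimal usage sketch (hypothetical `stack`/`num_args` variables) for the
// helper above: count every Tensor among the top `num_args` arguments of a
// boxed op, including Tensors nested inside TensorLists.
//
//   size_t num_tensors = 0;
//   _foreach_tensor(
//       [&](size_t idx_tensor, size_t idx_arg, const at::Tensor& t) {
//         // idx_tensor counts Tensors (TensorList elements included);
//         // idx_arg is the argument's position relative to stack_start.
//         ++num_tensors;
//       },
//       stack,
//       /*stack_start=*/stack->size() - num_args,
//       /*size=*/num_args);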
} // namespace
void autogradNotImplementedFallbackImpl(
const c10::OperatorHandle& op,
c10::DispatchKeySet dispatch_keys,
torch::jit::Stack* stack) {
// Mimics a subset of the logic of a VariableType NotImplemented kernel
// See gen_variable_type.py
const auto& schema = op.schema();
const auto& op_name = schema.operator_name().name;
const auto num_arguments = schema.arguments().size();
const auto num_returns = schema.returns().size();
const auto stack_start = stack->size() - num_arguments;
const bool grad_mode = GradMode::is_enabled();
std::vector<const at::Tensor*> tensors_requiring_grad_on_stack;
  // Keep track of which outputs are outputs of in-place modifications,
  // so we can rebase_history if necessary
std::vector<bool> is_inplace_output(num_returns, false);
bool any_is_inplace_output = false;
std::vector<bool> is_aliased_output(num_returns, false);
int aliased_output_idx = -1;
for (const auto i : c10::irange(num_returns)) {
if (schema.is_aliasing({c10::SchemaArgType::output, i})) {
if (schema.is_mutable({c10::SchemaArgType::output, i})) {
is_inplace_output[i] = true;
any_is_inplace_output = true;
} else {
TORCH_CHECK(
aliased_output_idx == -1,
"Expected only a single output in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). "
"Non-composite functions where multiple outputs are aliased with inputs aren't supported."
"Please rewrite your function as a composite function.");
aliased_output_idx = i;
}
is_aliased_output[i] = true;
}
}
int aliased_input_idx = -1;
for (const auto i : c10::irange(num_arguments)) {
if (schema.is_aliasing({c10::SchemaArgType::input, i}) &&
!schema.is_mutable({c10::SchemaArgType::input, i})) {
TORCH_CHECK(
aliased_input_idx == -1,
"Expected only a single input in the operator schema to have a non-write alias annotation (i.e., 'Tensor(a)'). "
"Non-composite functions where multiple inputs are aliased with outputs aren't supported. "
"Please rewrite your function as a composite function.");
aliased_input_idx = i;
}
}
size_t num_tensor_inputs = 0; // Only used for DEBUG-only checks
_foreach_tensor(
[&](size_t _, size_t idx_arg, const at::Tensor& t) {
if (grad_mode && t.requires_grad()) {
tensors_requiring_grad_on_stack.push_back(&t);
}
num_tensor_inputs++;
TORCH_CHECK_NOT_IMPLEMENTED(
!isFwGradDefined(t),
"Trying to use forward AD with ",
op_name,
" that does not support it.");
},
stack,
stack_start,
num_arguments);
const bool any_requires_grad = !tensors_requiring_grad_on_stack.empty();
_foreach_tensor(
[&](size_t _, size_t i, const at::Tensor& t) {
if (schema.is_mutable({c10::SchemaArgType::input, i})) {
check_inplace(t, any_requires_grad);
}
},
stack,
stack_start,
num_arguments);
std::shared_ptr<NotImplemented> grad_fn;
if (any_requires_grad) {
grad_fn = std::shared_ptr<NotImplemented>(
new NotImplemented(op_name), deleteNode);
grad_fn->set_next_edges(
collect_next_edges(tensors_requiring_grad_on_stack));
}
#ifndef NDEBUG
// See NOTE [ TensorImpl and Storage Pointer Sanity Checks ]
auto stack_args_copy =
std::vector<c10::IValue>(stack->begin() + stack_start, stack->end());
std::vector<c10::intrusive_ptr<c10::TensorImpl>> impl_saved;
impl_saved.reserve(num_tensor_inputs);
std::vector<c10::optional<c10::Storage>> storage_saved;
storage_saved.reserve(num_tensor_inputs);
_foreach_tensor(
[&](size_t idx, size_t _, const at::Tensor& t) {
storage_saved.push_back(
t.has_storage() ? c10::optional<c10::Storage>(t.storage())
: c10::nullopt);
impl_saved.push_back(t.getIntrusivePtr());
},
&stack_args_copy,
0,
num_arguments);
#endif
if (aliased_input_idx != -1 || any_is_inplace_output) {
at::AutoDispatchBelowAutograd guard;
op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
} else {
// If neither in-place nor view
at::AutoDispatchBelowADInplaceOrView guard;
op.redispatchBoxed(
dispatch_keys & c10::after_ADInplaceOrView_keyset, stack);
}
#ifndef NDEBUG
_foreach_tensor(
[&](size_t idx_tensor, size_t _, const at::Tensor& t) {
if (storage_saved.at(idx_tensor).has_value())
TORCH_INTERNAL_ASSERT(
storage_saved.at(idx_tensor).value().is_alias_of(t.storage()),
op_name);
if (impl_saved.at(idx_tensor))
TORCH_INTERNAL_ASSERT(
impl_saved.at(idx_tensor) == t.getIntrusivePtr(), op_name);
},
&stack_args_copy,
0,
num_arguments);
_foreach_tensor(
[&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) {
if (at::impl::tensor_has_dispatch(t) ||
at::impl::dispatch_mode_enabled())
return;
if (!is_inplace_output[idx_ret])
TORCH_INTERNAL_ASSERT(
t.use_count() <= 1, op_name); // Okay to return undefined tensor
        // note(crcrpar): `_foreach_norm` returns a list of scalar Tensors,
        // each of which shares the storage of a hidden, intermediate 1D
        // Tensor created inside the CUDA implementation. This is because the
        // reference implementation in the nvidia/apex repo returns that 1D
        // Tensor, where each element represents the norm of the corresponding
        // input Tensor, while we want to return the same number of Tensors as
        // the input TensorList, see https://github.com/pytorch/pytorch/issues/93940
if (!is_aliased_output[idx_ret] && t.has_storage() &&
op_name != "aten::_foreach_norm")
TORCH_INTERNAL_ASSERT(t.storage().use_count() == 1);
},
stack,
stack->size() - num_returns,
num_returns);
  // There should be only a single base-view pair; make sure their storages
  // are aliased.
if (aliased_input_idx != -1 && aliased_output_idx != -1) {
const c10::IValue& aliased_input_iv = stack_args_copy[aliased_input_idx];
const c10::IValue& aliased_output_iv =
(*stack)[stack->size() - num_returns + aliased_output_idx];
TORCH_INTERNAL_ASSERT(aliased_input_iv.isTensor(), op_name);
TORCH_INTERNAL_ASSERT(
aliased_output_iv.isTensor() || aliased_output_iv.isTensorList(),
op_name);
const at::Tensor& aliased_input = aliased_input_iv.toTensor();
if (aliased_input.has_storage()) {
if (aliased_output_iv.isTensor()) {
        const at::Tensor& aliased_output = aliased_output_iv.toTensor();
TORCH_INTERNAL_ASSERT(
aliased_input.storage().is_alias_of(aliased_output.storage()),
op_name);
} else {
const auto aliased_output_vec = aliased_output_iv.toTensorVector();
for (const auto& aliased_output : aliased_output_vec) {
TORCH_INTERNAL_ASSERT(
aliased_input.storage().is_alias_of(aliased_output.storage()),
op_name);
}
}
}
}
#endif
if (any_requires_grad) {
_foreach_tensor(
[&](size_t idx_tensor, size_t idx_ret, const at::Tensor& t) {
if (isDifferentiableType(t.scalar_type())) {
if (is_inplace_output[idx_ret]) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
rebase_history(const_cast<at::Tensor&>(t), grad_fn);
} else {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
set_history(const_cast<at::Tensor&>(t), grad_fn);
}
}
},
stack,
stack->size() - num_returns,
num_returns);
}
}
torch::CppFunction autogradNotImplementedFallback() {
return torch::CppFunction::makeFromBoxedFunction<
&autogradNotImplementedFallbackImpl>();
}
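// A usage sketch (hypothetical `myops::myadd` operator, following
// https://pytorch.org/tutorials/advanced/dispatcher): register this boxed
// fallback as the Autograd kernel of a custom op that has no derivative
// formula, so that calling backward through the op raises the usual
// "derivative ... is not implemented" error rather than silently detaching
// the graph.
//
//   TORCH_LIBRARY_IMPL(myops, Autograd, m) {
//     m.impl("myadd", torch::autograd::autogradNotImplementedFallback());
//   }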
void autogradNotImplementedInplaceOrViewFallbackImpl(
const c10::OperatorHandle& op,
c10::DispatchKeySet dispatch_keys,
torch::jit::Stack* stack) {
// Mimics a subset of the logic from ADInplaceOrViewType kernel:
// - see gen_inplace_or_view_type.py
// - this should only be used with autogradNotImplementedFallback above
// - For more information see
// https://pytorch.org/tutorials/advanced/dispatcher
//
// NOTE [ Limitations of ADInplaceOrView boxed kernel ]
//
  // This op should only be used with the autogradNotImplementedFallback kernel
  // because there is some logic we need specifically to enforce that even
  // if we do in-place on views created in this kernel, the proper "derivative
  // is not implemented" error is still raised.
//
  // Just like the codegened kernel, we try to enforce some things:
  // - For views: we enforce that the view relationship is between the first
  //   input and the first output (which may be either a Tensor or a vector of
  //   Tensors).
  // - For inplace (TODO?): we enforce that the same op cannot be both a view
  //   and inplace; that is not allowed in the gen_inplace_or_view logic.
const auto& schema = op.schema();
const auto& op_name = schema.operator_name().name;
const auto num_arguments = schema.arguments().size();
const auto num_returns = schema.returns().size();
const auto stack_start = stack->size() - num_arguments;
at::Tensor aliased_input;
int64_t aliased_output_idx = -1;
for (const auto i : c10::irange(num_returns)) {
if (schema.is_aliasing({c10::SchemaArgType::output, i}) &&
!schema.is_mutable({c10::SchemaArgType::output, i})) {
TORCH_CHECK(
aliased_output_idx == -1,
"Fallback ADInplaceOrView kernel expects only a single output in the operator schema to have a "
"non-write alias annotation (i.e., 'Tensor(a)'). "
"Non-composite functions where multiple outputs are aliased with inputs aren't supported."
"Please rewrite your function as a composite function.");
aliased_output_idx = i;
}
}
int64_t aliased_input_idx = -1;
for (const auto i : c10::irange(num_arguments)) {
if (schema.is_aliasing({c10::SchemaArgType::input, i}) &&
!schema.is_mutable({c10::SchemaArgType::input, i})) {
TORCH_CHECK(
aliased_input_idx == -1,
"Fallback ADInplaceOrView kernel expects only a single input in the operator schema to have a "
"non-write alias annotation (i.e., 'Tensor(a)'). "
"Non-composite functions where multiple inputs are aliased with outputs aren't supported. "
"Please rewrite your function as a composite function.");
aliased_input_idx = i;
const c10::IValue& aliased_input_iv =
(*stack)[stack_start + i]; // get a reference to an ivalue on the
// stack
TORCH_CHECK(aliased_input_iv.isTensor());
aliased_input =
aliased_input_iv.toTensor(); // TODO: Can we avoid saving this tensor
// and incurring the refcount bump?
}
}
// See NOTE [ Limitations of ADInplaceOrView boxed kernel ] above
TORCH_CHECK(
(aliased_input_idx == -1 && aliased_output_idx == -1) ||
(aliased_input_idx == 0 && aliased_output_idx == 0),
"Fallback ADInplaceOrView kernel can only create view relationships between the first "
"input and the first output (the output can be a vector of tensors). Please change the "
"order of your operator's parameters so that this is the case.");
const bool is_view = aliased_input_idx != -1;
{
at::AutoDispatchBelowADInplaceOrView guard;
op.redispatchBoxed(
dispatch_keys & c10::after_ADInplaceOrView_keyset, stack);
}
for (const auto i : c10::irange(num_returns)) {
if (schema.is_mutable({c10::SchemaArgType::output, i})) {
increment_version((*stack)[stack->size() - num_returns + i].toTensor());
}
}
if (is_view) {
c10::IValue& aliased_output_iv =
(*stack)[stack->size() - num_returns + aliased_output_idx];
if (aliased_output_iv.isTensorList()) {
auto aliased_output = aliased_output_iv.toTensorVector();
      // We only allow rebasing of the history if we return a single Tensor,
      // which is why we don't have to care about the view_func logic below.
// See NOTE [ View + Inplace detection ] for more details about this logic
auto result = as_view(
/* base=*/aliased_input,
/* tensors=*/aliased_output,
/* is_bw_differentiable=*/true,
/* is_fw_differentiable=*/true,
/* creation_meta=*/
InferenceMode::is_enabled()
? CreationMeta::INFERENCE_MODE
: (at::GradMode::is_enabled() ? CreationMeta::MULTI_OUTPUT_NODE
: CreationMeta::NO_GRAD_MODE));
      // ^ We pass in creation meta unnecessarily even if the output is not of
      // a differentiable type, but we don't have that information here anyway.
stack->at(stack->size() - num_returns + aliased_output_idx) = result;
} else {
TORCH_CHECK(aliased_output_iv.isTensor());
auto result = as_view(
/* base=*/aliased_input,
/* tensor=*/std::move(aliased_output_iv).toTensor(),
/* is_bw_differentiable=*/true,
/* is_fw_differentiable=*/true,
/* view_func=*/
[op_name = op_name](const at::Tensor&) {
// We always need this view_func because otherwise if we do in-place
// on this view, we would implicitly use AsStridedBackward instead
// of the NotImplemented node. For the cross-dtype/non-strided
// cases, we would create something like this anyway
TORCH_CHECK(
false,
"Mutating the view ",
op_name,
" which does not have a derivative implemented is forbidden.");
return at::Tensor();
},
/* creation_meta=*/
InferenceMode::is_enabled()
? CreationMeta::INFERENCE_MODE
: (at::GradMode::is_enabled() ? CreationMeta::DEFAULT
: CreationMeta::NO_GRAD_MODE));
stack->at(stack->size() - num_returns + aliased_output_idx) =
std::move(result);
}
}
}
torch::CppFunction autogradNotImplementedInplaceOrViewFallback() {
return torch::CppFunction::makeFromBoxedFunction<
&autogradNotImplementedInplaceOrViewFallbackImpl>();
}
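// A usage sketch (hypothetical `myops::my_view` operator): for a view or
// in-place custom op without a derivative, this kernel is intended to be
// paired with autogradNotImplementedFallback above, so version counting and
// view tracking still happen while autograd reports "not implemented".
//
//   TORCH_LIBRARY_IMPL(myops, Autograd, m) {
//     m.impl("my_view", torch::autograd::autogradNotImplementedFallback());
//   }
//   TORCH_LIBRARY_IMPL(myops, ADInplaceOrView, m) {
//     m.impl(
//         "my_view",
//         torch::autograd::autogradNotImplementedInplaceOrViewFallback());
//   }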
} // namespace autograd
} // namespace torch